def main():
    df = table_utils._read(config.CLICK_DATA) \
        .join(table_utils._read(config.RELEVANCE_DATA)
                  .set_index(['norm_query', 'hit_title']),
              on=['norm_query', 'hit_title'], how='inner')

    dfQ = df['norm_query'].drop_duplicates()
    queries = dfQ.reindex(np.random.permutation(dfQ.index))[:20]
    del dfQ

    condition = np.zeros(len(df), dtype=bool)
    for q in queries:
        condition = condition | (df['norm_query'] == q)
    dfShort = df[condition]

    pd.set_option('display.width', 1000)
    for norm_query, query_group in dfShort.groupby(['norm_query']):
        hits = []
        for title, hit_group in query_group.groupby(['hit_title']):
            num_clicks = np.sum(hit_group['clicked'])
            avg_position = np.mean(hit_group['hit_position'])
            relevance = np.mean(hit_group['relevance'])
            hits.append((title, num_clicks, avg_position, relevance))
        queries = list(query_group['query'].drop_duplicates())
        print("Normalized Query: %s" % (norm_query.encode('utf8')))
        print("Queries:\n\t%s" % ("\n\t".join(
            map(lambda x: x.encode('utf8'), queries))))
        hitDf = pd.DataFrame(
            hits, columns=['title', 'num_clicks', 'avg_pos', 'dbn_rel'])
        print(hitDf.sort_values(['dbn_rel'], ascending=False))
        print("\n\n")

def main():
    # uses inner to ensure we have complete rows. Otherwise we would have NaN
    # mixed in with the relevance scores, and we can't train on those
    dfClicks = table_utils._read(config.CLICK_DATA)
    dfRel = table_utils._read(config.RELEVANCE_DATA)
    if config.REMOVE_WALL:
        dfRel = dfRel[(dfRel['relevance'] < 0.24) | (dfRel['relevance'] > 0.25)]
    dfAll = dfClicks \
        .join(dfRel.set_index(['norm_query', 'hit_title']),
              on=['norm_query', 'hit_title'], how='inner')
    dfClicks_len = len(dfClicks)
    rows_w_rel = len(dfAll)
    del dfClicks
    del dfRel

    # Filter out pages that couldn't be loaded as docs/termvecs
    es_docs_keys = set(map(int, shelve_keys(config.ES_PAGE_DOCS_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_docs_keys)]
    es_termvec_keys = set(map(int, shelve_keys(config.ES_PAGE_TERM_VEC_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_termvec_keys)]

    # drop some unnecessary columns
    dfAll.drop(['session_id', 'clicked'], axis=1, inplace=True)

    # average out hit_position and hit_score to improve de-duplication.
    # on 10k queries this is a 90% reduction in data
    dfAll = dfAll.join(
        dfAll.groupby(['query', 'hit_page_id'])['hit_position'].mean(),
        on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll = dfAll.join(
        dfAll.groupby(['query', 'hit_page_id'])['hit_score'].mean(),
        on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll.drop(['hit_position', 'hit_score'], axis=1, inplace=True)

    # turn duplicates into a 'weight' column
    dfAll = dfAll.groupby(map(str, dfAll.columns)).size().reset_index()
    dfAll.rename(columns={0: 'weight'}, inplace=True)

    # Assign ids to all normalized queries, to be used to generate pairwise
    # preferences for the ranking learners
    norm_query_id_df = pd.DataFrame(
        list(enumerate(dfAll["norm_query"].unique())),
        columns=["norm_query_id", "norm_query"])
    dfAll = dfAll.join(norm_query_id_df.set_index("norm_query"),
                       on="norm_query")
    # xgboost requires the data to be in groups by id
    dfAll.sort_values("norm_query_id", inplace=True)

    table_utils._write(config.ALL_DATA, dfAll)
    dfInfo = dfAll[["relevance", "weight", "norm_query_id"]].copy()
    table_utils._write(config.INFO_DATA, dfInfo)

    print 'Source clicks len: %d' % (dfClicks_len)
    print 'Rows with relevance: %d' % (rows_w_rel)
    print 'Final data len: %d' % (len(dfAll))
    print 'Ratio: %.3f' % (float(len(dfAll)) / dfClicks_len)

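# A minimal, self-contained sketch of the dedup-to-weight step above:
# identical rows collapse into a single row plus a 'weight' column counting
# the duplicates. The column names here are toy data, not the pipeline schema.
def example_weight_column():
    import pandas as pd
    df = pd.DataFrame({'query': ['a', 'a', 'b'],
                       'hit_page_id': [1, 1, 2]})
    deduped = df.groupby(list(df.columns)).size().reset_index()
    deduped.rename(columns={0: 'weight'}, inplace=True)
    # -> one row for ('a', 1) with weight 2, one row for ('b', 2) with weight 1
    return deduped
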
def main():
    # This ends up also including the weight of many sessions against same
    # (query, hit_title) pairs in the correlation. Is that desirable?
    dfAll = table_utils._read(config.CLICK_DATA) \
        .join(table_utils._read(config.ES_PAGE_DOCS),
              on='hit_page_id', how='left') \
        .join(table_utils._read(config.RELEVANCE_DATA)
                  .set_index(['norm_query', 'hit_title']),
              on=['norm_query', 'hit_title'], how='left')
    #    .join(table_utils._read(config.ES_PAGE_TERM_VEC), on='hit_page_id', how='left') \
    #    .join(table_utils._read(config.ES_QUERY_TERM_VEC), on='query', how='left') \
    #dfAll = dfAll[~dfAll['incoming_links'].isnull()]

    # Single value fields
    single_funcs = {
        'id(%s)': lambda x: x,
        'log(%s)': np.log,
        'log2(%s)': np.log2,
        'log1p(%s)': np.log1p,
        'log10(%s)': np.log10,
        'sqrt(%s)': np.sqrt,
        'square(%s)': np.square,
        'exp(%s)': np.exp,
        'expm1(%s)': np.expm1,
        'log(%s+2)': lambda x: np.log(x + 2),
        'log(%s+1e-7)': lambda x: np.log(x + 1e-7),
        'log(1-%s)': lambda x: np.log(1 - x),
    }

    corr_name = 'pearsonr'

    def corr_func(x, y):
        return stats.pearsonr(x, y)[0]

    pairs = []
    dfAll = dfAll[~dfAll['relevance'].isnull()]
    # Correlations are much stronger when removing the "wall" at .24 <= x <= .25
    # dfAll = dfAll[(dfAll['relevance'] > 0.25) | (dfAll['relevance'] < 0.24)]
    for field in ['hit_incoming_links', 'hit_popularity_score',
                  'hit_text_bytes']:
        for x_fname, x_func in single_funcs.iteritems():
            dfClean = dfAll[~dfAll[field].isnull()]
            xt = dfClean[field].apply(x_func).values
            for y_fname, y_func in single_funcs.iteritems():
                yt = dfClean["relevance"].apply(y_func).values
                corr = corr_func(xt.flatten(), yt)
                if not np.isnan(corr):
                    pairs.append((abs(corr), "%s(%s, %s): %.6f" % (
                        corr_name, x_fname % field,
                        y_fname % 'relevance', corr)))
                print(".", end="")
    print("\n")
    for _, s in sorted(pairs, key=lambda tup: tup[0]):
        print(s)

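# Toy illustration of the transform grid above: apply each candidate transform
# to both the feature and the relevance values, and keep whichever pairing has
# the strongest pearson correlation. The data and the reduced set of
# transforms here are purely illustrative.
def example_best_transform():
    import numpy as np
    from scipy import stats
    x = np.array([1.0, 10.0, 100.0, 1000.0])
    y = np.array([0.1, 0.3, 0.6, 0.9])
    funcs = {'id': lambda v: v, 'log1p': np.log1p, 'sqrt': np.sqrt}
    scored = [(abs(stats.pearsonr(fx(x), fy(y))[0]), nx, ny)
              for nx, fx in funcs.items()
              for ny, fy in funcs.items()]
    # returns (abs_corr, x_transform_name, y_transform_name)
    return max(scored)
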
def main(): logname = "generate_feature_ident.log" logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = table_utils._read(config.ALL_DATA) # Copies of data from ES docs. Note that multi-valued fields are first # converted into their length obs_fields = ["incoming_links", "popularity_score", "text_bytes", "category", "template", "heading", "outgoing_link", "external_link", "redirect.title", "auxiliary_text"] transforms = [None, np.log, np.log10, np.sqrt] dedup = True for transform in transforms: param_list = [transform] sf = StandaloneFeatureWrapper(Ident, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # Sub-fields from termvec data obs_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS] obs_fields += ['query_' + x + '_termvec' for x in config.ES_TERM_FIELDS] obs_fields += ['norm_query_' + x + '_termvec' for x in config.ES_TERM_FIELDS] es_fields = ['score', 'term_freq', 'ttf', 'doc_freq'] aggregation_mode = ["mean", "std", "max", "min", "median"] for es_field in es_fields: for transform in transforms: param_list = [es_field, transform, aggregation_mode] sf = StandaloneFeatureWrapper(SubFieldIdent, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go()
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)
    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]

    # single valued fields
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text"]]
    ngrams = [1, 2, 3, 12, 123][:3]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger, dedup)
                pf.go()

    # multi-valued fields
    target_fields_list = [["category", "template", "heading", "outgoing_link",
                           "external_link", "redirect.title",
                           "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            multi_gen = MultiTargetEstimatorWrapper(generator)
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger, dedup)
                pf.go()

def main(): logname = "generate_feature_basic.log" logger = logging_utils._get_logger(config.LOG_DIR, logname) dfAll = table_utils._read(config.ALL_DATA) # basic generators = [DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio] obs_fields = ["query", "norm_query", "hit_title", 'opening_text'] for generator in generators: param_list = [] dedup = False if generator == DocFreq else True sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # basic against multi-value fields obs_fields = [ 'category', 'template', 'heading', 'outgoing_link', 'external_link', 'redirect.title', 'auxiliary_text' ] aggregations = ['mean', 'std', 'max', 'min', 'median'] param_list = [aggregations] for generator in generators: multi_gen = MultiObjEstimatorWrapper(generator) dedup = False if generator == DocFreq else True sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # unique count generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = ["query", "norm_query", "hit_title", 'opening_text'] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: param_list = [ngram] dedup = True sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go() # unique count against multi-value fields generators = [UniqueCount_Ngram, UniqueRatio_Ngram] obs_fields = [ 'category', 'template', 'heading', 'outgoing_link', 'external_link', 'redirect.title', 'auxiliary_text' ] aggregations = ['mean', 'std', 'max', 'min', 'median'] ngrams = [1, 2, 3] for generator in generators: for ngram in ngrams: multi_gen = MultiObjEstimatorWrapper(generator) param_list = [ngram, aggregations] dedup = True sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup) sf.go()
def main():
    queries = table_utils._read(config.CLICK_DATA)[['query', 'norm_query']] \
        .stack().drop_duplicates().reset_index(drop=True).values
    url = config.ES_URL + '/page/_mtermvectors'

    session = requests.Session()
    reqs = []
    for batch in np_utils._split(queries, 100):
        docs = []
        for query in batch:
            doc = {
                'doc': dict(zip(config.ES_TERM_FIELDS,
                                [query] * len(config.ES_TERM_FIELDS))),
                'fields': config.ES_TERM_FIELDS,
                'positions': False,
                'offsets': False,
                'term_statistics': True,
                'filter': {
                    'max_num_terms': 50,
                },
            }
            # Custom munging for the redirect.title field
            del doc['doc']['redirect.title']
            doc['doc']['redirect'] = [{'namespace': 0, 'title': query}]
            docs.append(doc)
        # for *extra* funsies, elasticsearch isn't going to give us back the
        # original doc that we searched for, although it could be
        # reconstructed from the tokens under most circumstances.
        reqs.append(
            grequests.post(url, data=json.dumps({'docs': docs}),
                           session=session,
                           callback=batch_parse_termvec(batch)))

    data = table_utils._open_shelve_write(config.ES_QUERY_TERM_VEC_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(queries)) as bar:
            for r in grequests.imap(reqs, size=20,
                                    exception_handler=exception_handler):
                for query, vecs in r.data.iteritems():
                    # Can't directly use unicode strings until 3.x, only strings
                    data[query.encode('utf8')] = vecs
                i += len(r.data)
                bar.update(i)
    finally:
        data.close()

def main():
    # returns an ndarray
    page_ids = table_utils._read(config.CLICK_DATA)['hit_page_id'].unique()
    url = config.ES_URL + '/page/_mtermvectors'

    # shelve is a disk-backed dict. It's a good bit slower than pandas'
    # in-memory implementation, but on large sets there is just too much
    # data to hold in memory
    docs = table_utils._open_shelve_write(config.ES_PAGE_TERM_VEC_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(page_ids)) as bar:
            # doing it all in one grequests.imap seems to hold onto a ton
            # of extra memory. On a sample of ~200k page ids I was seeing
            # 14G+, but breaking into smaller pieces brought it down to 7G
            for top_batch in np_utils._split(page_ids, 10000):
                session = requests.Session()
                reqs = []
                for batch in np_utils._split(top_batch, 100):
                    data = {
                        'ids': list(batch),
                        'parameters': {
                            'fields': config.ES_TERM_FIELDS,
                            'positions': False,
                            'offsets': False,
                            'term_statistics': True,
                            'filter': {
                                'max_num_terms': config.TERM_VEC_MAX_NUM_TERMS,
                            },
                        },
                    }
                    reqs.append(
                        grequests.post(url, data=json.dumps(data),
                                       session=session))
                for r in grequests.imap(reqs, size=20,
                                        exception_handler=exception_handler):
                    found = json.loads(r.text)['docs']
                    for d in found:
                        # TODO: Store empty doc, or handle when reading?
                        if d['found']:
                            docs[str(d['_id'])] = es_utils.parse_termvec(d)
                    i += len(found)
                    bar.update(i)
                # sync every 10k docs frees up memory
                docs.sync()
    finally:
        docs.close()

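# For reference, a stdlib-only sketch of the disk-backed dict behaviour the
# shelve helpers above rely on. Shelve keys must be plain (byte) strings in
# 2.x, which is why page ids go in as str(d['_id']).
def example_shelve(path):
    import shelve
    db = shelve.open(path)
    try:
        db['4213'] = {'title': {'some term': {'term_freq': 2}}}
        return db['4213']
    finally:
        db.close()
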
def main():
    # returns an ndarray
    page_ids = table_utils._read(config.CLICK_DATA)['hit_page_id'].unique()
    url = config.ES_URL + '/page/_mget'
    params = {'fields': ','.join(config.ES_DOC_FIELDS)}
    defaults = config.ES_DOC_FIELDS_DEFAULTS
    multi_value_fields = [k for k, v in defaults.iteritems()
                          if isinstance(v, tuple)]

    docs = table_utils._open_shelve_write(config.ES_PAGE_DOCS_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(page_ids)) as bar:
            for top_batch in np_utils._split(page_ids, 10000):
                session = requests.Session()
                reqs = []
                for batch in np_utils._split(top_batch, 100):
                    data = json.dumps({'ids': list(batch)})
                    reqs.append(
                        grequests.post(url, data=data, params=params,
                                       session=session))
                for r in grequests.imap(reqs, size=20,
                                        exception_handler=exception_handler):
                    found = json.loads(r.text)['docs']
                    for d in found:
                        if not d['found']:
                            continue
                        res = defaults.copy()
                        for field, v in d['fields'].iteritems():
                            # ES always returns a list, even if there is only
                            # one item. Flatten down single valued fields
                            res[field] = tuple(v) \
                                if field in multi_value_fields else v[0]
                        docs[str(d['_id'])] = res
                    i += len(found)
                    bar.update(i)
                docs.sync()
    finally:
        docs.close()

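# Toy illustration of the field flattening above: ES wraps every field value
# in a list, so single-valued fields are unwrapped while fields whose default
# is a tuple stay multi-valued. The field names here are hypothetical.
def example_flatten_fields(fields, defaults):
    multi = set(k for k, v in defaults.iteritems() if isinstance(v, tuple))
    res = defaults.copy()
    for field, v in fields.iteritems():
        res[field] = tuple(v) if field in multi else v[0]
    return res

# example_flatten_fields({'text_bytes': [1234], 'heading': ['a', 'b']},
#                        {'text_bytes': 0, 'heading': ()})
# -> {'text_bytes': 1234, 'heading': ('a', 'b')}
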
def main(): logname = "generate_feature_tfidf.log" logger = logging_utils._get_logger(config.LOG_DIR, logname) obs_fields = ['query', 'norm_query'] target_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS] dfAll = table_utils._read(config.ALL_DATA) docs = table_utils._open_shelve_read(config.ES_PAGE_TERM_VEC_SHELVE) queries = table_utils._open_shelve_read(config.ES_QUERY_TERM_VEC_SHELVE) generators = [ES_TFIDF_Unigram_TopN_CosineSim] dedup = True for generator in generators: for target_field in target_fields: obs_fields_tv = [x + '_' + target_field for x in obs_fields] param_list = [] # TODO: why iterate obs_fields instead of passing all at once? pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields_tv, [target_field], param_list, config.FEAT_DIR, logger, dedup) pf.go()
def run_edit_distance():
    logname = "generate_feature_edit_distance.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # single value targets
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text"]]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields,
                                    target_fields, param_list,
                                    config.FEAT_DIR, logger, dedup)
        pf.go()

    # multi-value targets
    target_fields_list = [["category", "template", "heading", "outgoing_link",
                           "external_link", "redirect.title",
                           "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = [aggregation_mode]
        multi_gen = MultiTargetEstimatorWrapper(EditDistance)
        pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields,
                                    target_fields, param_list,
                                    config.FEAT_DIR, logger, dedup)
        pf.go()

import os
import timeit

import feather
import pandas as pd

import config
from utils import table_utils

df = table_utils._read(config.ALL_DATA)

FILE_HDF = os.path.join(config.TMP_DIR, 'test.h5')
FILE_PICKLE = os.path.join(config.TMP_DIR, 'test.pkl')
FILE_FEATHER = os.path.join(config.TMP_DIR, 'test.feather')


def test_hdf_write():
    df.to_hdf(FILE_HDF, 'test', mode='w')


def test_hdf_read():
    pd.read_hdf(FILE_HDF, 'test')


def test_pickle_write():
    df.to_pickle(FILE_PICKLE)


def test_pickle_read():
    pd.read_pickle(FILE_PICKLE)


def test_feather_write():
    feather.write_dataframe(df.copy(), FILE_FEATHER)


def test_feather_read():
    feather.read_dataframe(FILE_FEATHER)

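# One way to drive the benchmarks above (an assumption about intended usage,
# since nothing in this file invokes them): time each reader/writer a few
# times with timeit and print the totals.
if __name__ == '__main__':
    for func in [test_hdf_write, test_hdf_read,
                 test_pickle_write, test_pickle_read,
                 test_feather_write, test_feather_read]:
        print('%s: %.3fs' % (func.__name__, timeit.timeit(func, number=3)))
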
def _load_data_dict(self):
    fname = os.path.join(config.FEAT_DIR, "Combine",
                         self.feature_name + config.FEAT_FILE_SUFFIX)
    data_dict = table_utils._read(fname)
    return data_dict

def main():
    df = table_utils._read(config.CLICK_DATA)
    # TODO: using only session_id might be more efficient, but
    # (query, session_id) is more obvious to debug
    grouped = df.groupby(['norm_query', 'session_id'])

    clickmodel_data_file = os.path.join(config.TMP_DIR, 'clickmodel.txt')
    pos = 0
    with codecs.open(clickmodel_data_file, 'w', 'utf8') as f:
        with progressbar.ProgressBar() as bar:
            for (norm_query, session_id), group in bar(grouped):
                assert "\t" not in norm_query
                assert type(norm_query) == unicode
                results = []
                clicks = []
                # TODO: groupby still necessary? check all group lengths, they
                # might be 1 after adjustments to the source hive query
                for title, rows in group \
                        .sort_values(['hit_score'], ascending=False) \
                        .groupby(['hit_title'], sort=False):
                    # TODO: using page id instead of title might be more
                    # efficient, but title is easier for debugging
                    results.append(title)
                    clicks.append(True in list(rows['clicked']))
                    if len(results) >= config.DBN_CONFIG['MAX_DOCS_PER_QUERY']:
                        break
                # exclude groups with no clicks
                if not any(clicks):
                    continue
                # exclude too small result sets as well
                if len(results) < config.DBN_CONFIG['MIN_DOCS_PER_QUERY']:
                    continue
                f.write("\t".join([
                    str(pos),                            # hash digest
                    norm_query,                          # query
                    '0',                                 # region
                    '0',                                 # intent weight
                    json.dumps(results),                 # urls
                    json.dumps([False] * len(results)),  # layout
                    json.dumps(clicks)                   # clicks
                ]) + "\n")
                pos += 1
    del df

    with codecs.open(clickmodel_data_file, 'r', 'utf8') as f:
        reader = InputReader(config.DBN_CONFIG['MIN_DOCS_PER_QUERY'],
                             config.DBN_CONFIG['MAX_DOCS_PER_QUERY'],
                             False,
                             config.DBN_CONFIG['SERP_SIZE'],
                             False,
                             discard_no_clicks=True)
        sessions = reader(f)
        dbn_config = config.DBN_CONFIG.copy()
        dbn_config['MAX_QUERY_ID'] = reader.current_query_id + 1
        model = DbnModel((0.9, 0.9, 0.9, 0.9), config=dbn_config)
        model.train(sessions)

        f.seek(0)
        results = []
        # This is a bit ugly and hackish ... but trying to not explode memory
        # by flipping the giant url_to_id and query_to_id dicts.
        seen = set()  # hax
        with progressbar.ProgressBar(max_value=pos) as bar:
            pos = 0
            for line in f:
                bar.update(pos)
                pos += 1
                _, norm_query, _, _, titles, _, clicks = \
                    line.rstrip().split('\t')
                titles = json.loads(titles)
                if len(titles) < dbn_config['MIN_DOCS_PER_QUERY']:
                    continue
                query_id = reader.query_to_id[(norm_query, "0")]
                title_ids = [reader.url_to_id[t] for t in titles]
                session = SessionItem(0, query_id, title_ids, 0, [], {})
                relevances = model.get_model_relevances(session)
                for title, relevance in zip(titles, relevances):
                    if (norm_query, title) in seen:
                        continue
                    results.append([norm_query, title, relevance])
                    # alternately could use drop_duplicates, not sure which
                    # is cheaper on memory usage
                    seen.add((norm_query, title))

    df = pd.DataFrame(results,
                      columns=['norm_query', 'hit_title', 'relevance'])
    print 'Hits with relevance: %d' % (len(results))
    table_utils._write(config.RELEVANCE_DATA, df)

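# A minimal sketch of one record in the clickmodels input format written
# above: seven tab-separated columns, with the urls, layout, and clicks
# columns JSON-encoded. Values here are toy data.
def example_clickmodel_line():
    import json
    results = [u'Title A', u'Title B']
    clicks = [True, False]
    return "\t".join([
        '0',                                 # hash digest
        u'example query',                    # query
        '0',                                 # region
        '0',                                 # intent weight
        json.dumps(results),                 # urls
        json.dumps([False] * len(results)),  # layout
        json.dumps(clicks),                  # clicks
    ])
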
def combine(self):
    dfAll = table_utils._read(config.INFO_DATA)
    dfAll_raw = dfAll.copy()
    y = dfAll['relevance'].values

    feat_cnt = 0
    self.logger.info('Run for basic...')
    for file_name in sorted(os.listdir(config.FEAT_DIR)):
        if config.FEAT_FILE_SUFFIX not in file_name:
            continue
        fname = os.path.splitext(file_name)[0]
        if fname not in self.feature_dict:
            continue
        x = self.load_feature(config.FEAT_DIR, fname)
        x = np.nan_to_num(x)
        # Still necessary?
        if np.isnan(x).any():
            self.logger.info("%s nan" % (fname))
            continue
        # Apply feature transformers (?)
        mandatory = self.feature_dict[fname][0]
        transformer = self.feature_dict[fname][1]
        x = transformer.fit_transform(x)
        dim = np_utils._dim(x)
        if dim == 1:
            corr = np_utils._corr(x, y)
            if not mandatory and (np.isnan(corr) or
                                  abs(corr) < self.corr_threshold):
                self.logger.info(
                    "Drop: {} ({}D) (abs_corr = {}, < threshold {})".format(
                        fname, dim, abs(corr), self.corr_threshold))
                continue
            dfAll[fname] = x
            self.feature_names.append(fname)
        else:
            # use a distinct loop variable here; reusing `x` in the list
            # comprehension would clobber the feature array in python 2
            columns = ["%s_%d" % (fname, i) for i in range(dim)]
            df = pd.DataFrame(x, columns=columns)
            dfAll = pd.concat([dfAll, df], axis=1)
            self.feature_names.extend(columns)
        feat_cnt += 1
        self.feature_names_basic.append(fname)
        if dim == 1:
            self.logger.info(
                "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim,
                    corr))
        else:
            self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                feat_cnt, len(self.feature_dict.keys()), fname, dim))

    dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
    self.y = dfAll["relevance"].values.astype(float)
    self.weights = dfAll['weight'].values
    self.query_ids = dfAll['norm_query_id'].values
    dfAll.drop(["relevance", "weight", "norm_query_id"], axis=1, inplace=True)
    self.X = dfAll.values.astype(float)
    self.logger.info("Overall Shape: %d x %d" %
                     (len(self.y), self.X.shape[1]))
    self.logger.info("Done combining")

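# Toy illustration of the 1-D feature gate above: a non-mandatory feature
# survives only if |corr(feature, relevance)| clears the threshold. scipy's
# pearsonr stands in for the project's np_utils._corr helper here.
def example_feature_gate(x, y, mandatory=False, corr_threshold=0.1):
    import numpy as np
    from scipy import stats
    corr = stats.pearsonr(np.asarray(x, dtype=float),
                          np.asarray(y, dtype=float))[0]
    keep = mandatory or (not np.isnan(corr) and abs(corr) >= corr_threshold)
    return keep, corr
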
def load_feature(self, feature_dir, feature_name):
    fname = os.path.join(feature_dir,
                         feature_name + config.FEAT_FILE_SUFFIX)
    return table_utils._read(fname)