Example #1
def main():
    df = table_utils._read(config.CLICK_DATA) \
        .join(table_utils._read(config.RELEVANCE_DATA).set_index(['norm_query', 'hit_title']),
              on=['norm_query', 'hit_title'], how='inner')

    dfQ = df['norm_query'].drop_duplicates()
    queries = dfQ.reindex(np.random.permutation(dfQ.index))[:20]
    del dfQ

    condition = np.zeros(len(df), dtype=bool)
    for q in queries:
        condition = condition | (df['norm_query'] == q)
    dfShort = df[condition]

    pd.set_option('display.width', 1000)
    for norm_query, query_group in dfShort.groupby(['norm_query']):
        hits = []
        for title, hit_group in query_group.groupby(['hit_title']):
            num_clicks = np.sum(hit_group['clicked'])
            avg_position = np.mean(hit_group['hit_position'])
            relevance = np.mean(hit_group['relevance'])
            hits.append((title, num_clicks, avg_position, relevance))
        queries = list(query_group['query'].drop_duplicates())
        print("Normalized Query: %s" % (norm_query.encode('utf8')))
        print("Queries:\n\t%s" % ("\n\t".join(map(lambda x: x.encode('utf8'), queries))))
        hitDf = pd.DataFrame(hits, columns=['title', 'num_clicks', 'avg_pos', 'dbn_rel'])
        print(hitDf.sort_values(['dbn_rel'], ascending=False))
        print("\n\n")
Example #2
def main():
    # uses inner to ensure we have complete rows. Otherwise
    # we would have NaN mixed in with the relevance scores,
    # and we can't train on those
    dfClicks = table_utils._read(config.CLICK_DATA)
    dfRel = table_utils._read(config.RELEVANCE_DATA)
    if config.REMOVE_WALL:
        dfRel = dfRel[(dfRel['relevance'] < 0.24) | (dfRel['relevance'] > 0.25)]

    dfAll = dfClicks \
            .join(dfRel.set_index(['norm_query', 'hit_title']),
                  on=['norm_query', 'hit_title'], how='inner')
    dfClicks_len = len(dfClicks)
    rows_w_rel = len(dfAll)
    del dfClicks
    del dfRel

    # Filter out pages that couldn't be loaded as docs/termvecs
    es_docs_keys = set(map(int, shelve_keys(config.ES_PAGE_DOCS_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_docs_keys)]

    es_termvec_keys = set(map(int, shelve_keys(config.ES_PAGE_TERM_VEC_SHELVE)))
    dfAll = dfAll[dfAll['hit_page_id'].isin(es_termvec_keys)]

    # drop some unnecessary columns
    dfAll.drop(['session_id', 'clicked'], axis=1, inplace=True)

    # average out hit_position and hit_score to improve de-duplication.
    # on 10k queries this is a 90% reduction in data
    dfAll = dfAll.join(dfAll.groupby(['query', 'hit_page_id'])['hit_position'].mean(),
                       on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll = dfAll.join(dfAll.groupby(['query', 'hit_page_id'])['hit_score'].mean(),
                       on=['query', 'hit_page_id'], rsuffix='_mean')
    dfAll.drop(['hit_position', 'hit_score'], axis=1, inplace=True)

    # turn duplicates into a 'weight' column
    dfAll = dfAll.groupby(map(str, dfAll.columns)).size().reset_index()
    dfAll.rename(columns={0: 'weight'}, inplace=True)

    # Assign id's to all normalized queries, to be used to generate pairwise
    # preferences for the ranking learners
    norm_query_id_df = pd.DataFrame(list(enumerate(dfAll["norm_query"].unique())),
                                    columns=["norm_query_id", "norm_query"])
    dfAll = dfAll.join(norm_query_id_df.set_index("norm_query"), on="norm_query")
    # xgboost requires the data to be in groups by id
    dfAll.sort_values("norm_query_id", inplace=True)

    table_utils._write(config.ALL_DATA, dfAll)

    dfInfo = dfAll[["relevance", "weight", "norm_query_id"]].copy()
    table_utils._write(config.INFO_DATA, dfInfo)

    print 'Source clicks len: %d' % (dfClicks_len)
    print 'Rows with relevance: %d' % (rows_w_rel)
    print 'Final data len: %d' % (len(dfAll))
    print 'Ratio: %.3f' % (float(len(dfAll))/dfClicks_len)
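The duplicate-to-weight step above (group by every column, take the group sizes) can be tried in isolation; a minimal sketch on toy data with hypothetical columns:

import pandas as pd

df = pd.DataFrame({
    'norm_query': ['a', 'a', 'a', 'b'],
    'hit_page_id': [1, 1, 2, 3],
    'relevance': [0.9, 0.9, 0.1, 0.5],
})

# Identical rows collapse into a single row plus a count of how often they occurred
weighted = df.groupby(list(df.columns)).size().reset_index()
weighted.rename(columns={0: 'weight'}, inplace=True)
print(weighted)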
Example #3
def main():
    # This ends up also including the weight of many sessions against the same
    # (query, hit_title) pairs in the correlation. Is that desirable?
    dfAll = table_utils._read(config.CLICK_DATA) \
            .join(table_utils._read(config.ES_PAGE_DOCS), on='hit_page_id', how='left') \
            .join(table_utils._read(config.RELEVANCE_DATA).set_index(['norm_query', 'hit_title']),
                  on=['norm_query', 'hit_title'], how='left')
            #.join(table_utils._read(config.ES_PAGE_TERM_VEC), on='hit_page_id', how='left') \
            #.join(table_utils._read(config.ES_QUERY_TERM_VEC), on='query', how='left') \

    #dfAll = dfAll[~dfAll['incoming_links'].isnull()]
    # Single value fields
    single_funcs = {
        'id(%s)': lambda x: x,
        'log(%s)': np.log,
        'log2(%s)': np.log2,
        'log1p(%s)': np.log1p,
        'log10(%s)': np.log10,
        'sqrt(%s)': np.sqrt,
        'square(%s)': np.square,
        'exp(%s)': np.exp,
        'expm1(%s)': np.expm1,
        'log(%s+2)': lambda x: np.log(x + 2),
        'log(%s+1e-7)': lambda x: np.log(x + 1e-7),
        'log(1-%s)': lambda x: np.log(1 - x),
    }

    corr_name = 'pearsonr'
    def corr_func(x, y):
        return stats.pearsonr(x, y)[0]

    pairs = []
    dfAll = dfAll[~dfAll['relevance'].isnull()]

    # Correlations are much stronger when removing the "wall" at .24 <= x <= .25
    # dfAll = dfAll[(dfAll['relevance'] > 0.25) | (dfAll['relevance'] < 0.24)]
    for field in ['hit_incoming_links', 'hit_popularity_score', 'hit_text_bytes']:
        for x_fname, x_func in single_funcs.iteritems():
            dfClean = dfAll[~dfAll[field].isnull()]
            xt = dfClean[field].apply(x_func).values
            for y_fname, y_func in single_funcs.iteritems():
                yt = dfClean["relevance"].apply(y_func).values
                corr = corr_func(xt.flatten(), yt)
                if not np.isnan(corr):
                    pairs.append((abs(corr), "%s(%s, %s): %.6f" % (corr_name, x_fname % field, y_fname % 'relevance', corr)))
                print(".", end="")

    print("\n")
    for _, s in sorted(pairs, key=lambda tup: tup[0]):
        print(s)
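The nested transform loop above reduces to applying two elementwise functions and taking Pearson's r; a minimal sketch with scipy on made-up values:

import numpy as np
from scipy import stats

incoming_links = np.array([3.0, 50.0, 400.0, 1200.0, 7.0])
relevance = np.array([0.10, 0.30, 0.60, 0.90, 0.20])

# Compare the raw correlation against one transformed pair
raw_corr = stats.pearsonr(incoming_links, relevance)[0]
log_corr = stats.pearsonr(np.log(incoming_links + 2), np.log1p(relevance))[0]
print('id: %.3f  log(x+2)/log1p: %.3f' % (raw_corr, log_corr))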
Example #4
def main():
    logname = "generate_feature_ident.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # Copies of data from ES docs. Note that multi-valued fields are first
    # converted into their length
    obs_fields = ["incoming_links", "popularity_score", "text_bytes",
            "category", "template", "heading", "outgoing_link", "external_link",
            "redirect.title", "auxiliary_text"]
    transforms = [None, np.log, np.log10, np.sqrt]
    dedup = True
    for transform in transforms:
        param_list = [transform]
        sf = StandaloneFeatureWrapper(Ident, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup)
        sf.go()

    # Sub-fields from termvec data
    obs_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS]
    obs_fields += ['query_' + x + '_termvec' for x in config.ES_TERM_FIELDS]
    obs_fields += ['norm_query_' + x + '_termvec' for x in config.ES_TERM_FIELDS]
    es_fields = ['score', 'term_freq', 'ttf', 'doc_freq']
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for es_field in es_fields:
        for transform in transforms:
            param_list = [es_field, transform, aggregation_mode]
            sf = StandaloneFeatureWrapper(SubFieldIdent, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, dedup)
            sf.go()
Example #5
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    # single valued fields
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text" ]]
    ngrams = [1,2,3,12,123][:3]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()

    # multi-valued fields
    target_fields_list = [["category", "template", "heading",
            "outgoing_link", "external_link", "redirect.title",
            "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            multi_gen = MultiTargetEstimatorWrapper(generator)
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
                pf.go()
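JaccardCoef_Ngram compares n-gram sets of the observation and target strings; the underlying measure, independent of the feature framework, is plain set overlap. A minimal sketch (whitespace tokenization here is an assumption and may differ from the project's preprocessing):

def ngrams(tokens, n):
    return set(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def jaccard_ngram(a, b, n=1):
    # Jaccard coefficient between the n-gram sets of two strings
    A, B = ngrams(a.split(), n), ngrams(b.split(), n)
    if not A and not B:
        return 0.0
    return float(len(A & B)) / len(A | B)

print(jaccard_ngram('example search query', 'example query'))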
Example #6
def main():
    logname = "generate_feature_basic.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # basic
    generators = [DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["query", "norm_query", "hit_title", 'opening_text']
    for generator in generators:
        param_list = []
        dedup = False if generator == DocFreq else True
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # basic against multi-value fields
    obs_fields = [
        'category', 'template', 'heading', 'outgoing_link', 'external_link',
        'redirect.title', 'auxiliary_text'
    ]
    aggregations = ['mean', 'std', 'max', 'min', 'median']
    param_list = [aggregations]
    for generator in generators:
        multi_gen = MultiObjEstimatorWrapper(generator)
        dedup = False if generator == DocFreq else True
        sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger, dedup)
        sf.go()

    # unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["query", "norm_query", "hit_title", 'opening_text']
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            dedup = True
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger,
                                          dedup)
            sf.go()

    # unique count against multi-value fields
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = [
        'category', 'template', 'heading', 'outgoing_link', 'external_link',
        'redirect.title', 'auxiliary_text'
    ]
    aggregations = ['mean', 'std', 'max', 'min', 'median']
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            multi_gen = MultiObjEstimatorWrapper(generator)
            param_list = [ngram, aggregations]
            dedup = True
            sf = StandaloneFeatureWrapper(multi_gen, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger,
                                          dedup)
            sf.go()
Example #7
def main():
    queries = table_utils._read(config.CLICK_DATA)[['query', 'norm_query']] \
            .stack().drop_duplicates().reset_index(drop=True).values

    url = config.ES_URL + '/page/_mtermvectors'
    session = requests.Session()
    reqs = []
    for batch in np_utils._split(queries, 100):
        docs = []
        for query in batch:
            doc = {
                'doc': dict(zip(config.ES_TERM_FIELDS,
                                [query] * len(config.ES_TERM_FIELDS))),
                'fields': config.ES_TERM_FIELDS,
                'positions': False,
                'offsets': False,
                'term_statistics': True,
                'filter': {
                    'max_num_terms': 50
                }
            }
            # Custom munging for the redirect.title field
            del doc['doc']['redirect.title']
            doc['doc']['redirect'] = [{'namespace': 0, 'title': query}]
            docs.append(doc)
        # for *extra* funsies, elasticsearch isn't going to give us back the original
        # doc that we searched for, although it could be reconstructed from the tokens
        # under most circumstances.
        reqs.append(
            grequests.post(url,
                           data=json.dumps({'docs': docs}),
                           session=session,
                           callback=batch_parse_termvec(batch)))

    data = table_utils._open_shelve_write(config.ES_QUERY_TERM_VEC_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(queries)) as bar:
            for r in grequests.imap(reqs,
                                    size=20,
                                    exception_handler=exception_handler):
                for query, vecs in r.data.iteritems():
                    # shelve keys can't be unicode strings until Python 3.x, only byte strings
                    data[query.encode('utf8')] = vecs
                i += len(r.data)
                bar.update(i)
    finally:
        data.close()
Example #8
def main():
    # returns an ndarray
    page_ids = table_utils._read(config.CLICK_DATA)['hit_page_id'].unique()

    url = config.ES_URL + '/page/_mtermvectors'
    # shelve is a disk-backed dict; it's a good bit slower than pandas' in-memory
    # implementation, but on large sets there is just too much data to hold in memory
    docs = table_utils._open_shelve_write(config.ES_PAGE_TERM_VEC_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(page_ids)) as bar:
            # Doing it all in one grequests.imap seems to hold onto a ton
            # of extra memory. On a sample of ~200k page ids I was seeing
            # 14G+, but breaking it into smaller pieces brought it down to 7G
            for top_batch in np_utils._split(page_ids, 10000):
                session = requests.Session()
                reqs = []
                for batch in np_utils._split(top_batch, 100):
                    data = {
                        'ids': list(batch),
                        'parameters': {
                            'fields': config.ES_TERM_FIELDS,
                            'positions': False,
                            'offsets': False,
                            'term_statistics': True,
                            'filter': {
                                'max_num_terms': config.TERM_VEC_MAX_NUM_TERMS,
                            }
                        }
                    }
                    reqs.append(
                        grequests.post(url,
                                       data=json.dumps(data),
                                       session=session))

                for r in grequests.imap(reqs,
                                        size=20,
                                        exception_handler=exception_handler):
                    found = json.loads(r.text)['docs']
                    for d in found:
                        # TODO: Store empty doc, or handle when reading?
                        if d['found']:
                            docs[str(d['_id'])] = es_utils.parse_termvec(d)
                    i += len(found)
                    bar.update(i)
                # syncing every 10k docs frees up memory
                docs.sync()
    finally:
        docs.close()
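A minimal synchronous version of the same _mtermvectors call, using plain requests instead of grequests (the URL and field names are placeholders; the real values come from config in the examples above):

import json
import requests

url = 'http://localhost:9200/page/_mtermvectors'
body = {
    'ids': ['12345', '67890'],
    'parameters': {
        'fields': ['title', 'opening_text'],
        'positions': False,
        'offsets': False,
        'term_statistics': True,
        'filter': {'max_num_terms': 50},
    },
}
r = requests.post(url, data=json.dumps(body),
                  headers={'Content-Type': 'application/json'})
for d in r.json()['docs']:
    # Missing docs come back with found=False and no term vectors
    if d.get('found'):
        print(d['_id'], sorted(d['term_vectors'].keys()))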
Example #9
def main():
    # returns an ndarray
    page_ids = table_utils._read(config.CLICK_DATA)['hit_page_id'].unique()

    url = config.ES_URL + '/page/_mget'
    params = {'fields': ','.join(config.ES_DOC_FIELDS)}

    defaults = config.ES_DOC_FIELDS_DEFAULTS
    multi_value_fields = [
        k for k, v in defaults.iteritems() if isinstance(v, tuple)
    ]

    docs = table_utils._open_shelve_write(config.ES_PAGE_DOCS_SHELVE)
    i = 0
    try:
        with progressbar.ProgressBar(max_value=len(page_ids)) as bar:
            for top_batch in np_utils._split(page_ids, 10000):
                session = requests.Session()
                reqs = []
                for batch in np_utils._split(top_batch, 100):
                    data = json.dumps({'ids': list(batch)})
                    reqs.append(
                        grequests.post(url,
                                       data=data,
                                       params=params,
                                       session=session))
                for r in grequests.imap(reqs,
                                        size=20,
                                        exception_handler=exception_handler):
                    found = json.loads(r.text)['docs']
                    for d in found:
                        if not d['found']:
                            continue
                        res = defaults.copy()
                        for field, v in d['fields'].iteritems():
                            # ES always returns a list, even if there is only one item.
                            # Flatten down single-valued fields
                            res[field] = tuple(v) if field in multi_value_fields else v[0]
                        docs[str(d['_id'])] = res
                    i += len(found)
                    bar.update(i)
                docs.sync()
    finally:
        docs.close()
Example #10
def main():
    logname = "generate_feature_tfidf.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_fields = ['query', 'norm_query']
    target_fields = [x + '_termvec' for x in config.ES_TERM_FIELDS]

    dfAll = table_utils._read(config.ALL_DATA)
    docs = table_utils._open_shelve_read(config.ES_PAGE_TERM_VEC_SHELVE)
    queries = table_utils._open_shelve_read(config.ES_QUERY_TERM_VEC_SHELVE)

    generators = [ES_TFIDF_Unigram_TopN_CosineSim]
    dedup = True
    for generator in generators:
        for target_field in target_fields:
            obs_fields_tv = [x + '_' + target_field for x in obs_fields]
            param_list = []
            # TODO: why iterate obs_fields instead of passing all at once?
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields_tv,
                                        [target_field], param_list,
                                        config.FEAT_DIR, logger, dedup)
            pf.go()
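Judging by its name, ES_TFIDF_Unigram_TopN_CosineSim computes a cosine similarity between two sparse term-weight vectors built from the term vector shelves; a minimal sketch of that similarity over plain dicts (the weights here are invented, not the project's TF-IDF scheme):

import math

def cosine_sim(a, b):
    # a, b: dicts mapping term -> weight
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

print(cosine_sim({'example': 1.2, 'query': 0.8}, {'example': 0.5, 'page': 1.0}))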
Example #11
def run_edit_distance():
    logname = "generate_feature_edit_distance.log"
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = table_utils._read(config.ALL_DATA)

    # single value targets
    obs_fields_list = [["query", "norm_query"]]
    target_fields_list = [["hit_title", "opening_text"]]
    dedup = True
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
        pf.go()

    # multi-value targets
    target_fields_list = [["category", "template", "heading",
            "outgoing_link", "external_link", "redirect.title",
            "auxiliary_text"]]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = [aggregation_mode]
        multi_gen = MultiTargetEstimatorWrapper(EditDistance)
        pf = PairwiseFeatureWrapper(multi_gen, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, dedup)
        pf.go()
Example #12
import pandas as pd
import feather
import os
import timeit

import config
from utils import table_utils

df = table_utils._read(config.ALL_DATA)

FILE_HDF = os.path.join(config.TMP_DIR, 'test.h5')
FILE_PICKLE = os.path.join(config.TMP_DIR, 'test.pkl')
FILE_FEATHER = os.path.join(config.TMP_DIR, 'test.feather')

def test_hdf_write():
    df.to_hdf(FILE_HDF, 'test', mode='w')

def test_hdf_read():
    pd.read_hdf(FILE_HDF, 'test')

def test_pickle_write():
    df.to_pickle(FILE_PICKLE)

def test_pickle_read():
    pd.read_pickle(FILE_PICKLE)

def test_feather_write():
    feather.write_dataframe(df.copy(), FILE_FEATHER)

def test_feather_read():
    feather.read_dataframe(FILE_FEATHER)
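The module above defines the benchmark bodies but no harness; one plausible driver, assuming it is appended to the same file (so df and the test functions are in scope), uses the already-imported timeit:

# Hypothetical driver; the repetition count is arbitrary.
for fn in [test_hdf_write, test_hdf_read,
           test_pickle_write, test_pickle_read,
           test_feather_write, test_feather_read]:
    seconds = timeit.timeit(fn, number=3)
    print('%s: %.2fs' % (fn.__name__, seconds))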
Example #13
    def _load_data_dict(self):
        fname = os.path.join(config.FEAT_DIR + "/Combine",
                             self.feature_name + config.FEAT_FILE_SUFFIX)
        data_dict = table_utils._read(fname)
        return data_dict
Example #14
def main():
    df = table_utils._read(config.CLICK_DATA)
    # TODO: using only session_id might be more efficient, but (query, session_id)
    # is more obvious to debug
    grouped = df.groupby(['norm_query', 'session_id'])
    clickmodel_data_file = os.path.join(config.TMP_DIR, 'clickmodel.txt')
    pos = 0
    with codecs.open(clickmodel_data_file, 'w', 'utf8') as f:
        with progressbar.ProgressBar() as bar:
            for (norm_query, session_id), group in bar(grouped):
                assert "\t" not in norm_query
                assert type(norm_query) == unicode
                results = []
                clicks = []
                # TODO: groupby still necessary? check all group lengths, they might be 1
                # after adjustments to the source hive query
                for title, rows in group \
                        .sort_values(['hit_score'], ascending=False) \
                        .groupby(['hit_title'], sort=False):
                    # TODO: using page id instead of title might be more efficient,
                    # but title is easier for debugging
                    results.append(title)
                    clicks.append(True in list(rows['clicked']))
                    if len(results) >= config.DBN_CONFIG['MAX_DOCS_PER_QUERY']:
                        break
                # exclude groups with no clicks
                if not any(clicks):
                    continue
                # exclude too small result sets as well
                if len(results) < config.DBN_CONFIG['MIN_DOCS_PER_QUERY']:
                    continue
                f.write("\t".join([
                    str(pos),  # hash digest
                    norm_query,  # query
                    '0',  # region
                    '0',  # intent weight
                    json.dumps(results),  # urls
                    json.dumps([False] * len(results)),  # layout
                    json.dumps(clicks)  # clicks
                ]) + "\n")
                pos += 1

    del df
    with codecs.open(clickmodel_data_file, 'r', 'utf8') as f:
        reader = InputReader(config.DBN_CONFIG['MIN_DOCS_PER_QUERY'],
                             config.DBN_CONFIG['MAX_DOCS_PER_QUERY'],
                             False,
                             config.DBN_CONFIG['SERP_SIZE'],
                             False,
                             discard_no_clicks=True)
        sessions = reader(f)
        dbn_config = config.DBN_CONFIG.copy()
        dbn_config['MAX_QUERY_ID'] = reader.current_query_id + 1
        model = DbnModel((0.9, 0.9, 0.9, 0.9), config=dbn_config)
        model.train(sessions)

        f.seek(0)
        results = []
        # This is a bit ugly and hackish ... but trying to not explode memory
        # by flipping the giant url_to_id and query_to_id dicts.
        seen = set()
        # hax
        with progressbar.ProgressBar(max_value=pos) as bar:
            pos = 0
            for line in f:
                bar.update(pos)
                pos += 1

                _, norm_query, _, _, titles, _, clicks = line.rstrip().split('\t')
                titles = json.loads(titles)
                if len(titles) < dbn_config['MIN_DOCS_PER_QUERY']:
                    continue
                query_id = reader.query_to_id[(norm_query, "0")]
                title_ids = [reader.url_to_id[t] for t in titles]
                session = SessionItem(0, query_id, title_ids, 0, [], {})
                relevances = model.get_model_relevances(session)
                for title, relevance in zip(titles, relevances):
                    if (norm_query, title) in seen:
                        continue
                    results.append([norm_query, title, relevance])
                    # alternately could use drop_duplicates, not sure which
                    # is cheaper on memory usage
                    seen.add((norm_query, title))
        df = pd.DataFrame(results,
                          columns=['norm_query', 'hit_title', 'relevance'])
        print 'Hits with relevance: %d' % (len(results))
        table_utils._write(config.RELEVANCE_DATA, df)
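Each line written to clickmodel.txt above is a seven-field, tab-separated record; a minimal sketch that builds and re-parses one such line (the query and titles are invented):

import json

results = ['Title A', 'Title B', 'Title C']
clicks = [False, True, False]

line = "\t".join([
    '0',                                 # hash digest / running position
    'example query',                     # normalized query
    '0',                                 # region
    '0',                                 # intent weight
    json.dumps(results),                 # urls (titles here)
    json.dumps([False] * len(results)),  # layout
    json.dumps(clicks),                  # clicks
])

_, norm_query, _, _, titles, _, clicked = line.rstrip().split('\t')
assert json.loads(titles) == results
assert json.loads(clicked) == clicks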
Example #15
    def combine(self):
        dfAll = table_utils._read(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y = dfAll['relevance'].values

        feat_cnt = 0
        self.logger.info('Run for basic...')
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if config.FEAT_FILE_SUFFIX not in file_name:
                continue
            fname = os.path.splitext(file_name)[0]
            if fname not in self.feature_dict:
                continue
            x = self.load_feature(config.FEAT_DIR, fname)
            x = np.nan_to_num(x)
            # Still necessary?
            if np.isnan(x).any():
                self.logger.info("%s nan" % (fname))
                continue
            # Apply feature transformers (?)
            mandatory = self.feature_dict[fname][0]
            transformer = self.feature_dict[fname][1]
            x = transformer.fit_transform(x)
            dim = np_utils._dim(x)
            if dim == 1:
                corr = np_utils._corr(x, y)
                if not mandatory and (np.isnan(corr)
                                      or abs(corr) < self.corr_threshold):
                    self.logger.info(
                        "Drop: {} ({}D) (abs_corr = {}, < threshold {})".
                        format(fname, dim, abs(corr), self.corr_threshold))
                    continue
                dfAll[fname] = x
                self.feature_names.append(fname)
            else:
                columns = ["%s_%d" % (fname, x) for x in range(dim)]
                df = pd.DataFrame(x, columns=columns)
                dfAll = pd.concat([dfAll, df], axis=1)
                self.feature_names.extend(columns)
            feat_cnt += 1
            self.feature_names_basic.append(fname)
            if dim == 1:
                self.logger.info(
                    "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim,
                        corr))
            else:
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_dict.keys()), fname, dim))

        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.y = dfAll["relevance"].values.astype(float)
        self.weights = dfAll['weight'].values
        self.query_ids = dfAll['norm_query_id'].values
        dfAll.drop(["relevance", "weight", "norm_query_id"],
                   axis=1,
                   inplace=True)
        self.X = dfAll.values.astype(float)

        self.logger.info("Overall Shape: %d x %d" %
                         (len(self.y), self.X.shape[1]))
        self.logger.info("Done combining")
Example #16
    def load_feature(self, feature_dir, feature_name):
        fname = os.path.join(feature_dir,
                             feature_name + config.FEAT_FILE_SUFFIX)
        return table_utils._read(fname)