def main():
    logname = "generate_feature_group_relevance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        obs_fields = ["search_term", "product_title"][1:]
        aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
        param_list = [dfAll["id"], dfTrain2, aggregation_mode]
        sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
        sf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    obs_fields = ["search_term", "product_title"][1:]
    aggregation_mode = ["mean", "std", "max", "min", "median", "size"]
    param_list = [dfAll["id"], dfTrain, aggregation_mode]
    sf = StandaloneFeatureWrapper(GroupRelevance, dfAll, obs_fields, param_list, sub_feature_dir, logger)
    sf.go()
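
## A minimal sketch (an assumption, not the project's actual GroupRelevance
## implementation) of the kind of grouped statistic the wrapper above produces:
## aggregate the training relevance scores that share the same value of the
## observation field, then map those statistics back onto every row.
import pandas as pd

def group_relevance_sketch(dfAll, dfTrain, obs_field="product_title",
                           aggs=("mean", "std", "max", "min", "median", "size")):
    stats = dfTrain.groupby(obs_field)["relevance"].agg(list(aggs))
    # rows whose group never appears in the training fold get NaN statistics
    return dfAll[[obs_field]].join(stats, on=obs_field)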
def process(lang, pivot):
	print "[%s]: process for language %s" % (time_utils._timestamp(), lang)
	linkDict = pkl_utils._load(config.ILL_DICT["%s2%s" % (lang, pivot)])
	templateDict = pkl_utils._load(config.TEMPLATE2ARTICLE[lang])
	articleDict = pkl_utils._load(config.ARTICLE2TEMPLATE[pivot])
	mapping = pd.read_csv(config.EXISTING_MAPPING_OUTPUT[pivot], index_col="template")
	template1 = []; template2 = []
	article1 = []; article2 = []; ontology = []
	for template in templateDict:
		articles = templateDict[template]
		for article in articles:
			if article in linkDict:
				tmp = linkDict[article]
				template1.append(template)
				article1.append(article)
				article2.append(tmp)
				if tmp in articleDict:
					templateList = articleDict[tmp]
				else:
					templateList = []
				c = ""
				t = ""
				for Template in templateList:
					if Template in mapping.index:
						c = mapping.at[Template, "ontology"]
						t = Template
				template2.append(t)
				ontology.append(c)

	data = {"template1":template1, "article1":article1, "template2":template2, \
			"article2":article2, "ontology":ontology}
	df = pd.DataFrame(data)
	df.to_csv(config.ENTITY_MATRIX["%s2%s" % (lang, pivot)], index=False)
	print "[%s]: processing complete" % time_utils._timestamp()
def main(options):
	lang = options.lang
	p = options.parse
	t = options.train
	ncomp = options.ncomp
	me = options.me
	fin = options.fin
	fout = options.fout

	if p:
		parse(lang)
	if t:
		cmd = "python run_hole.py --fin %s --fout %s --test-all 50 --nb 100 --me %d \
			--margin 0.2 --lr 0.1 --ncomp %d" % (lang, config.HOLE_OUTPUT[lang], me, ncomp)
		os.system(cmd)
	
	hole = pkl_utils._load(config.HOLE_OUTPUT[lang])
	data_dict = pkl_utils._load(config.DATA_DICT[lang])
	model = hole["model"]
	entityDict = { y:x for x, y in enumerate(data_dict["entities"])}
	predicateDict = { y:x for x, y in enumerate(data_dict["relations"])}
	df = pd.read_csv(fin, names=["s", "p", "o"])
	df["s"] = df["s"].map(entityDict)
	df["p"] = df["p"].map(predicateDict)
	df["o"] = df["o"].map(entityDict)
	scores = model._scores(list(df["s"]), list(df["p"]), list(df["o"]))
	pd.DataFrame(scores).to_csv(fout, index=False, header=False)
def main():
    
    dfTrain = pd.read_csv(config.TRAIN_DATA, encoding="ISO-8859-1")
    dfTest = pd.read_csv(config.TEST_DATA, encoding="ISO-8859-1")


    # splits for level1
    splitter = HomedepotSplitter(dfTrain=dfTrain, 
                                dfTest=dfTest, 
                                n_iter=config.N_RUNS, 
                                random_state=config.RANDOM_SEED, 
                                verbose=True,
                                plot=True,
                                # tune these params to get a close distribution
                                split_param=[0.5, 0.25, 0.5],
                                )
    splitter.split()
    splitter.save("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level1 = splitter.splits


    ## splits for level2
    splits_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    splits_level2 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level1):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter2 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.6])
        splitter2.split()
        splits_level2[run] = splitter2.splits[0]
    pkl_utils._save("%s/splits_level2.pkl"%config.SPLIT_DIR, splits_level2)


    ## splits for level3
    splits_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
    splits_level3 = [0]*config.N_RUNS
    for run, (trainInd, validInd) in enumerate(splits_level2):
        dfValid = dfTrain.iloc[validInd].copy()
        splitter3 = HomedepotSplitter(dfTrain=dfValid, 
                                    dfTest=dfTest, 
                                    n_iter=1, 
                                    random_state=run, 
                                    verbose=True,
                                    # tune these params to get a close distribution
                                    split_param=[0.5, 0.15, 0.7])
        splitter3.split()
        splits_level3[run] = splitter3.splits[0]
    pkl_utils._save("%s/splits_level3.pkl"%config.SPLIT_DIR, splits_level3)
def run_tsne_lsa_ngram():
    logname = "generate_feature_tsne_lsa_ngram_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TSNE_LSA_Word_Ngram, TSNE_LSA_Char_Ngram]
    # ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields = ["search_term", "search_term_alt", "search_term_auto_corrected", "product_title", "product_description"]
    for generator,ngrams in zip(generators, ngrams_list):
        for ngram in ngrams:
            param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
            sf.go()

    generators = [TSNE_LSA_Word_Ngram_Pair]
    ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for ngram in ngrams:
            for generator in generators:
                param_list = [ngram, config.SVD_DIM, config.SVD_N_ITER]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger, force_corr=True)
                pf.go()
def main():
    logname = "generate_feature_intersect_position_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        IntersectPosition_Ngram, 
        IntersectNormPosition_Ngram, 
    ]
    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"][1:2] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def run_count():
    logname = "generate_feature_first_last_ngram_count_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        FirstIntersectCount_Ngram, 
        LastIntersectCount_Ngram, 
        FirstIntersectRatio_Ngram, 
        LastIntersectRatio_Ngram, 
    ]

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main():
    logname = "generate_feature_doc2vec_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    doc2vec_model_dirs = []
    model_prefixes = []
    ## doc2vec model trained with Homedepot dataset: brand/color/obs/title/description
    doc2vec_model_dirs.append( config.DOC2VEC_MODEL_DIR + "/Homedepot-doc2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
    model_prefixes.append( "Homedepot" )
    for doc2vec_model_dir, model_prefix in zip(doc2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=True)
            if ".txt" in doc2vec_model_dir:
                doc2vec_model = gensim.models.Doc2Vec.load_word2vec_format(doc2vec_model_dir, binary=False)
            else:
                doc2vec_model = gensim.models.Doc2Vec.load(doc2vec_model_dir)
                doc2vec_model_sent_label = pkl_utils._load(doc2vec_model_dir+".sent_label")
        except Exception:  # skip models that fail to load
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "search_term_alt", "product_title", "product_description", "product_attribute"]
        # generator = Doc2Vec_Vector
        # param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Doc2Vec_CosineSim, 
            Doc2Vec_RMSE, 
            Doc2Vec_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt"] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [doc2vec_model, doc2vec_model_sent_label, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def main():
    logname = "generate_feature_query_quality_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    obs_corpus = []
    query_suffix = []
    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("raw")
    # after processing    
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("lemmatized")
    # after extracting product_name in search_term
    obs_corpus.append(dfAll["search_term_product_name"].values)
    query_suffix.append("product_name")
    if "search_term_auto_corrected" in dfAll.columns:
        # after auto correction
        obs_corpus.append(dfAll["search_term_auto_corrected"].values)
        query_suffix.append("corrected")  
    # after stemming
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    obs_corpus.append(dfAll["search_term"].values)
    query_suffix.append("stemmed")

    y_train = dfAll["relevance"].values[:TRAIN_SIZE]
    for i in range(len(query_suffix)-1):
        for j in range(i+1, len(query_suffix)):
            ext = QueryQuality(obs_corpus[i], obs_corpus[j])
            x = ext.transform()
            dim = 1
            fname = "%s_%s_x_%s_%dD"%(ext._get_feat_name(), query_suffix[i], query_suffix[j], dim)
            pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x)
            corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
            logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))

    # raw
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    obs_fields = ["search_term"]
    param_list = []
    sf = StandaloneFeatureWrapper(IsInGoogleDict, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
    sf.go()
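
## np_utils._corr above reports how well each feature tracks the target. A
## minimal sketch of that kind of check (an assumption: Pearson correlation,
## with a guard for constant vectors that would otherwise give NaN):
import numpy as np

def pearson_corr_sketch(x, y):
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    if x.std() == 0. or y.std() == 0.:
        return 0.
    return float(np.corrcoef(x, y)[0, 1])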
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    for w in which.split(","):
        if w == "tf":
            generators.append( StatCoocTF_Ngram )
        elif w == "norm_tf":
            generators.append( StatCoocNormTF_Ngram )
        elif w == "tfidf":
            generators.append( StatCoocTFIDF_Ngram )
        elif w == "norm_tfidf":
            generators.append( StatCoocNormTFIDF_Ngram )
        elif w == "bm25":
            generators.append( StatCoocBM25_Ngram )


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ## document in query
    obs_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    target_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()


    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term_product_name"] )
    target_fields_list.append( ["product_title_product_name"] )
    ngrams = [1,2]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                if ngram == 2:
                    # since product_name is of length 2, it makes no difference 
                    # for various aggregation as there is only one item
                    param_list = [ngram, "mean"]
                else:
                    param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
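
## Why a single aggregation mode is enough for the bigram case above: a
## two-word product_name yields only one bigram, so every aggregator except
## "std" collapses to the same number. A tiny self-contained illustration:
import numpy as np

single = [0.7]  # only one match score to aggregate
assert np.mean(single) == np.max(single) == np.min(single) == np.median(single)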
def factorize(lang="en"):
	X = pkl_utils._load(config.TENSOR[lang])
	entityDict = pkl_utils._load(config.ENTITY[lang])
	typeDict = pkl_utils._load(config.TYPE[lang])
	entry = pkl_utils._load(config.TYPE_MATRIX[lang])
	t2e = {typeDict[t]:entityDict[t] for t in typeDict}
	_log.info("Data has been loaded")
	N, M = X[0].shape[0], len(X)
	_log.info('Datasize: %d x %d x %d' % (N, N, M))

	FOLDS = 5
	IDX = list(range(N))
	shuffle(IDX)
	fsz = int(N/FOLDS)
	offset = 0
	tid = t2e[typeDict["http://dbpedia.org/ontology/Person"]]
	GROUND_TRUTH = X[-1][:, tid]
	AUC = np.zeros(FOLDS)
	for f in range(FOLDS):
		idx = set(IDX[offset:offset+fsz])
		offset += fsz
		_log.info('Fold %d' % f)
		T = [x.copy() for x in X[:-1]]
		rows = []
		cols = []
		data = []
		for x,y in zip(entry[0], entry[1]):
			if (x in idx) and (y == tid):
				continue
			rows.append(x)
			cols.append(y)
			data.append(1)
		T.append(spsp.csr_matrix((data, (rows, cols)), (N, N)))
		_log.info('Construction complete')
		P = predict_rescal_als(T, tid)
		precision, recall, _ = precision_recall_curve(GROUND_TRUTH, P)
		AUC[f] = auc(recall, precision)
		_log.info('AUC: %f' % AUC[f])
	
	_log.info('AUC-PR Test Mean / Std: %f / %f' % (AUC.mean(), AUC.std()))
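
## A compact, self-contained sketch of the per-fold AUC-PR computed above with
## scikit-learn (note the argument order: auc(recall, precision)); the toy
## labels and scores are only for illustration.
import numpy as np
from sklearn.metrics import precision_recall_curve, auc

y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
scores = np.array([0.9, 0.1, 0.8, 0.4, 0.35, 0.2, 0.65, 0.3])
prec, rec, _ = precision_recall_curve(y_true, scores)
print("AUC-PR: %f" % auc(rec, prec))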
def run_compression_distance():
    logname = "generate_feature_compression_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(CompressionDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
def main():
    logname = "generate_feature_basic_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["search_term", "product_title", "product_description", 
                "product_attribute", "product_brand", "product_color"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2, ProductUidDummy3]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["search_term", "product_title", "product_description", 
    "product_attribute", "product_brand", "product_color"]
    ngrams = [1,2,3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount, 
        AttrBulletCount, 
        AttrBulletRatio, 
        AttrNonBulletCount, 
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        sf.go()
def main():
    logname = "generate_feature_group_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

    ## run python3 splitter.py first
    split = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
    n_iter = len(split)

    relevances_complete = [1, 1.25, 1.33, 1.5, 1.67, 1.75, 2, 2.25, 2.33, 2.5, 2.67, 2.75, 3]
    relevances = [1, 1.33, 1.67, 2, 2.33, 2.67, 3]
    ngrams = [1]
    obs_fields = ["search_term"]
    target_fields = ["product_title", "product_description"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]

    ## for cv
    for i in range(n_iter):
        trainInd, validInd = split[i][0], split[i][1]
        dfTrain2 = dfTrain.iloc[trainInd].copy()
        sub_feature_dir = "%s/Run%d" % (config.FEAT_DIR, i+1)

        for target_field in target_fields:
            for relevance in relevances:
                for ngram in ngrams:
                    param_list = [dfAll["id"], dfTrain2, target_field, relevance, ngram, aggregation_mode]
                    pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                    pf.go()

    ## for all
    sub_feature_dir = "%s/All" % (config.FEAT_DIR)
    for target_field in target_fields:
        for relevance in relevances:
            for ngram in ngrams:
                param_list = [dfAll["id"], dfTrain, target_field, relevance, ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(GroupRelevance_Ngram_Jaccard, dfAll, obs_fields, [target_field], param_list, sub_feature_dir, logger)
                pf.go()
def main():
    fnames = [
        "LSA100_Word_Unigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Bigram_Pair_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Unigram_Cooc_search_term_x_product_title_100D",
        "LSA100_Word_Obs_Unigram_Target_Bigram_Cooc_search_term_x_product_title_100D",
    ]

    fnames = [os.path.join(config.FEAT_DIR, fname+".pkl") for fname in fnames]

    for fname in fnames:
        f = pkl_utils._load(fname)
        columns = ["LSA%d"%(i+1) for i in range(f.shape[1])]
        pd.DataFrame(f, columns=columns).to_csv(fname[:-4]+".csv", index=False)
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)
    
    generators = [CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
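
## A rough sketch (an assumption, not the project's CharDistribution_* code) of
## what a character-distribution similarity between an observation field and a
## target field could look like: compare normalized character-frequency vectors
## with cosine similarity.
import numpy as np
from collections import Counter

def char_distribution(text, alphabet="abcdefghijklmnopqrstuvwxyz0123456789"):
    counts = Counter(c for c in text.lower() if c in alphabet)
    vec = np.array([counts[c] for c in alphabet], dtype=float)
    return vec / vec.sum() if vec.sum() > 0 else vec

def char_cosine_sim(obs, target):
    p, q = char_distribution(obs), char_distribution(target)
    denom = np.linalg.norm(p) * np.linalg.norm(q)
    return float(np.dot(p, q) / denom) if denom > 0 else 0.0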
def run_ngram_jaccard():
    logname = "generate_feature_ngram_jaccard_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [JaccardCoef_Ngram, DiceDistance_Ngram]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
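
## A minimal sketch of the n-gram set similarities the generators above are
## named after (an assumption about the exact implementation): the Jaccard
## coefficient and the Dice coefficient over word n-gram sets (DiceDistance
## is presumably one minus the latter).
def word_ngrams(text, n):
    tokens = text.split()
    return set(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def jaccard_coef(a, b, n=1):
    A, B = word_ngrams(a, n), word_ngrams(b, n)
    return len(A & B) / float(len(A | B)) if (A or B) else 0.0

def dice_coef(a, b, n=1):
    A, B = word_ngrams(a, n), word_ngrams(b, n)
    return 2.0 * len(A & B) / float(len(A) + len(B)) if (A or B) else 0.0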
def run_tfidf_ngram_cosinesim():
    logname = "generate_feature_tfidf_ngram_cosinesim_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [TFIDF_Word_Ngram_CosineSim, TFIDF_Char_Ngram_CosineSim]
    # ngrams_list = [[1,2,3], [2,3,4,5]]
    ngrams_list = [[1,2,3], [4]]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator,ngrams in zip(generators, ngrams_list):
            for ngram in ngrams:
                param_list = [ngram]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
def run_edit_distance():
    logname = "generate_feature_edit_distance_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"][1:2] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    ngrams = [1,2,3,12,123][:3]
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        param_list = []
        pf = PairwiseFeatureWrapper(EditDistance, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
        pf.go()
        for ngram in ngrams:
            param_list = [ngram, aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(EditDistance_Ngram, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def run_lsa_ngram_cooc():
    logname = "generate_feature_lsa_ngram_cooc_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    dfAll.drop(["product_attribute_list"], inplace=True, axis=1)

    generators = [LSA_Word_Ngram_Cooc]
    obs_ngrams = [1, 2]
    target_ngrams = [1, 2]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for obs_ngram in obs_ngrams:
            for target_ngram in target_ngrams:
                for generator in generators:
                    param_list = [obs_ngram, target_ngram, config.SVD_DIM, config.SVD_N_ITER]
                    pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                    pf.go()
def main():
    logname = "generate_feature_wordnet_similarity_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    generators = [
        WordNet_Path_Similarity,
        WordNet_Lch_Similarity,
        WordNet_Wup_Similarity,
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_description", "product_attribute"] )
    # double aggregation
    aggregation_mode_prev = ["mean", "max", "min", "median"]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = [aggregation_mode_prev, aggregation_mode]
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
def main(which):
    logname = "generate_feature_stat_cooc_tfidf_%s_%s.log" % (
        which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = []
    if which == "tf":
        generators.append(StatCoocTF_Ngram)
    elif which == "norm_tf":
        generators.append(StatCoocNormTF_Ngram)
    elif which == "tfidf":
        generators.append(StatCoocTFIDF_Ngram)
    elif which == "norm_tfidf":
        generators.append(StatCoocNormTFIDF_Ngram)
    elif which == "bm25":
        generators.append(StatCoocBM25_Ngram)

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["question1"])
    target_fields_list.append(["question2"])
    ## document in query
    obs_fields_list.append(["question2"])
    target_fields_list.append(["question1"])
    ngrams = [1, 2, 3, 12, 123][:3]
    aggregation_mode = ["mean", "std", "max", "min", "median"]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            for ngram in ngrams:
                param_list = [ngram, aggregation_mode]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
def run_char_dist_sim():
    logname = "generate_feature_char_dist_sim_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    generators = [
        CharDistribution_Ratio, CharDistribution_CosineSim, CharDistribution_KL
    ]
    obs_fields_list = [['question1'], ['question2']]
    target_fields_list = [['question2'], ['question1']]
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator,
                                        dfAll,
                                        obs_fields,
                                        target_fields,
                                        param_list,
                                        config.FEAT_DIR,
                                        logger,
                                        force_corr=True)
            pf.go()
            del pf
            gc.collect()
def main():
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = ["question1", "question2"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = ["question1", "question2"]
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger)
            sf.go()
def main():
    logname = "generate_feature_match_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    
    generators = [
        MatchQueryCount, 
        MatchQueryRatio, 
        LongestMatchSize,
        LongestMatchRatio
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_product_name", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_title", "product_title_product_name", "product_description", "product_attribute", "product_brand", "product_color"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()

    # product_attribute_list
    generators = [
        MatchAttrCount, 
        MatchAttrRatio, 
        IsIndoorOutdoorMatch, 
    ]
    obs_fields_list = []
    target_fields_list = []
    obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"] )
    target_fields_list.append( ["product_attribute_list"] )
    for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
        for generator in generators:
            param_list = []
            pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
            pf.go()
    def __init__(self,
                 model_name,
                 data_name,
                 cv_runs,
                 params_dict,
                 logger,
                 portion=100,
                 save_name=''):
        print("Loading data...")
        if portion <= 100:  # all the data, portion% clean + all noisy
            self.portion = '-' + str(portion) if portion != 100 else ''
        else:
            portion /= 100  # only clean data, portion% clean
            self.portion = '-' + str(int(portion)) + '-clean'
        print('run task on: ', self.portion, ' dataset: ', data_name)
        if data_name == "ontonotes":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(
                config.ONTONOTES_TRAIN_CLEAN + self.portion)
            words, mentions, positions, labels = data_utils.load(
                config.ONTONOTES_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
            num_types = len(type2id)
            type_info = config.ONTONOTES_TYPE
        elif data_name == "bbn":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(
                config.BBN_TRAIN_CLEAN + self.portion)
            words, mentions, positions, labels = data_utils.load(
                config.BBN_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.BBN_TYPE)
            num_types = len(type2id)
            type_info = config.BBN_TYPE
        else:
            assert False, 'you have to specify the dataset name with -d (e.g. bbn, ontonotes)'
        self.model_name = model_name
        self.savename = save_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        #self.hparams.alpha=alpha
        self.logger = logger

        self.id2type = {type2id[x]: x for x in type2id.keys()}

        def type2vec(types):  # only terminal will be labeled
            tmp = np.zeros(num_types)
            for t in str(types).split():
                if t in type2id.keys():
                    tmp[type2id[t]] = 1.0
            return tmp

        labels_train = np.array([type2vec(t)
                                 for t in labels_train])  # one-hot label vectors
        labels = np.array([type2vec(t) for t in labels])

        tempname = self.data_name + config.testemb
        tempname = os.path.join(config.PKL_DIR, tempname)
        if os.path.exists(tempname):
            self.embedding = pickle.load(open(tempname, 'rb'))
            print('embedding load over')
        else:
            self.embedding = embedding_utils.Embedding.fromCorpus(
                config.EMBEDDING_DATA, list(words_train) + list(words),
                config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
            pickle.dump(self.embedding, open(tempname, 'wb'))
            print('embedding dump over')
        self.embedding.max_document_length = config.MAX_DOCUMENT_LENGTH

        print("Preprocessing data...")

        if True:
            textlen_train = np.array([
                self.embedding.len_transform1(x) for x in words_train
            ])  # with cut down len sequence
            words_train = np.array([
                self.embedding.text_transform1(x) for x in words_train
            ])  # with cut down word id sequence and mask with zero <PAD>
            mentionlen_train = np.array([
                self.embedding.len_transform2(x) for x in mentions_train
            ])  # mention len
            mentions_train = np.array([
                self.embedding.text_transform2(x) for x in mentions_train
            ])  # mention text indexer
            positions_train = np.array([
                self.embedding.position_transform(x) for x in positions_train
            ])  # start ,end position
            print('get train data')

            textlen = np.array(
                [self.embedding.len_transform1(x) for x in words])
            words = np.array([
                self.embedding.text_transform1(x) for x in words
            ])  # padding and cut down
            mentionlen = np.array(
                [self.embedding.len_transform2(x) for x in mentions])
            mentions = np.array(
                [self.embedding.text_transform2(x) for x in mentions])
            positions = np.array(
                [self.embedding.position_transform(x) for x in positions])
            print('get test data')
            # pickle.dump([textlen_train, words_train, mentionlen_train, mentions_train, positions_train,
            #              textlen, words, mentionlen, mentions, positions
            #              ], open(os.path.join(self.data_name + config.prep+self.portion, 'wb'))
            # print('dump preprocessed data to pkl over...')
        # else:
        # textlen_train, words_train, mentionlen_train, mentions_train, \
        # positions_train, textlen, words, mentionlen, mentions, positions = pickle.load(
        # 	open(self.data_name + config.prep+self.portion, 'rb'))
        # print('load preprocessed data from pkl over...')

        #if True:
        ss = ShuffleSplit(n_splits=1,
                          test_size=0.1,
                          random_state=config.RANDOM_SEED)
        for test_index, valid_index in ss.split(np.zeros(len(labels)),
                                                labels):  # split by index
            textlen_test, textlen_valid = textlen[test_index], textlen[
                valid_index]
            words_test, words_valid = words[test_index], words[valid_index]
            mentionlen_test, mentionlen_valid = mentionlen[
                test_index], mentionlen[valid_index]
            mentions_test, mentions_valid = mentions[test_index], mentions[
                valid_index]
            positions_test, positions_valid = positions[test_index], positions[
                valid_index]
            labels_test, labels_valid = labels[test_index], labels[valid_index]

        self.train_set = list(
            zip(
                words_train,
                textlen_train,
                mentions_train,
                mentionlen_train,
                positions_train,
                labels_train,
            ))
        self.valid_set = list(
            zip(
                words_valid,
                textlen_valid,
                mentions_valid,
                mentionlen_valid,
                positions_valid,
                labels_valid,
            ))
        self.test_set = list(
            zip(
                words_test,
                textlen_test,
                mentions_test,
                mentionlen_test,
                positions_test,
                labels_test,
            ))

        self.full_test_set = list(
            zip(
                words,
                textlen,
                mentions,
                mentionlen,
                positions,
                labels,
            ))

        self.labels_test = labels_test
        self.labels = labels
        self.labels_valid = labels_valid

        self.num_types = num_types
        self.type_info = type_info
        self.logger.info("train set size:%d, test set size: %d" %
                         (len(self.train_set), len(self.full_test_set)))

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
        "sample": 0.001,
        "window": config.EMBEDDING_WINDOW,
        "workers": config.EMBEDDING_WORKERS,
    }
    model_dir = config.DOC2VEC_MODEL_DIR
    model_name = "Quora-doc2vec-D%d-min_count%d.model" % (
        model_param["size"], model_param["min_count"])

    doc2vec = DataFrameDoc2Vec(df, columns, model_param)
    doc2vec.train()
    doc2vec.save(model_dir, model_name)


#---------------------- Main ----------------------
if __name__ == "__main__":
    df = pkl_utils._load(config.ALL_DATA_LEMMATIZED)
    columns = ["question1", "question2"]
    columns = [col for col in columns if col in df.columns]

    if len(sys.argv) >= 2:
        for w in sys.argv[1].split(","):
            if w == "word2vec":
                train_word2vec_model(df, columns)
            elif w == "doc2vec":
                train_doc2vec_model(df, columns)
            else:
                print("Skip: %s" % w)
                continue
    else:
        train_doc2vec_model(df, columns)
        train_word2vec_model(df, columns)
def preprocess(data_name, if_clean=False, full_path=False):
    if data_name == "wiki":
        raw_all_file = config.WIKI_ALL
        raw_train_file = config.WIKI_TRAIN
        raw_valid_file = config.WIKI_VALID
        raw_test_file = config.WIKI_TEST
        clean_train_file = config.WIKI_TRAIN_CLEAN
        clean_test_file = config.WIKI_TEST_CLEAN
        type_file = config.WIKI_TYPE
    elif data_name == "wikim":
        raw_all_file = config.WIKIM_ALL
        raw_train_file = config.WIKIM_TRAIN
        raw_valid_file = config.WIKIM_VALID
        raw_test_file = config.WIKIM_TEST
        clean_train_file = config.WIKIM_TRAIN_CLEAN
        clean_test_file = config.WIKIM_TEST_CLEAN
        type_file = config.WIKIM_TYPE
    elif data_name == "ontonotes":
        raw_all_file = config.ONTONOTES_ALL
        raw_train_file = config.ONTONOTES_TRAIN
        raw_valid_file = config.ONTONOTES_VALID
        raw_test_file = config.ONTONOTES_TEST
        clean_train_file = config.ONTONOTES_TRAIN_CLEAN
        clean_test_file = config.ONTONOTES_TEST_CLEAN
        type_file = config.ONTONOTES_TYPE
    else:
        raise AttributeError("Invalid data name!")

    if not os.path.exists(type_file):
        create_type_dict(raw_all_file, type_file, full_path)
    type2id, typeDict = pkl_utils._load(type_file)

    df_train = pd.read_csv(raw_train_file,
                           sep="\t",
                           names=["p1", "p2", "text", "type", "f"])
    df_valid = pd.read_csv(raw_valid_file,
                           sep="\t",
                           names=["p1", "p2", "text", "type", "f"])
    df = pd.concat((df_train, df_valid), ignore_index=True)
    size = df.shape[0]
    outfile = open(clean_train_file, "w")
    for i in range(size):
        p1 = df["p1"][i]
        p2 = df["p2"][i]
        text = df["text"][i]
        types = df["type"][i].split()
        if if_clean and path_count(types) != 1:  # when cleaning, keep only mentions whose types form a single path
            continue

        text = clear_text(text)
        tokens = text.split()
        if p1 >= len(tokens):
            continue
        mention = " ".join(tokens[p1:p2])

        if p1 == 0:
            mention = "<PAD> " + mention
        else:
            mention = tokens[p1 - 1] + " " + mention
        if p2 >= len(tokens):
            mention = mention + " <PAD>"
        else:
            mention = mention + " " + tokens[p2]

        offset = max(0, p1 - config.WINDOW_SIZE)
        text = " ".join(tokens[offset:min(len(tokens), p2 +
                                          config.WINDOW_SIZE - 1)])
        p1 -= offset
        p2 -= offset

        out_type = []
        for a in types:
            flag = True
            for b in types:
                if len(a) >= len(b):
                    continue
                if (a == b[:len(a)]) and (b[len(a)] == "/"):
                    flag = False
            if flag:
                out_type.append(a)

        if len(out_type) > 0:
            if full_path:
                outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                              (p1, p2, text, mention, " ".join(types)))
            else:
                outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                              (p1, p2, text, mention, " ".join(out_type)))
    outfile.close()

    #VALIDATION separate
    df = pd.read_csv(raw_valid_file,
                     sep="\t",
                     names=["p1", "p2", "text", "type", "f"])
    outfile = open(clean_train_file.replace("train", "dev"), "w")
    size = df.shape[0]
    for i in range(size):
        p1 = df["p1"][i]
        p2 = df["p2"][i]
        text = df["text"][i]
        types = df["type"][i].split()

        text = clear_text(text)
        tokens = text.split()
        if p1 >= len(tokens):
            continue
        mention = " ".join(tokens[p1:p2])

        if p1 == 0:
            mention = "<PAD> " + mention
        else:
            mention = tokens[p1 - 1] + " " + mention
        if p2 >= len(tokens):
            mention = mention + " <PAD>"
        else:
            mention = mention + " " + tokens[p2]

        offset = max(0, p1 - config.WINDOW_SIZE)
        text = " ".join(tokens[offset:min(len(tokens), p2 +
                                          config.WINDOW_SIZE - 1)])
        p1 -= offset
        p2 -= offset

        out_type = []
        for a in types:
            flag = True
            for b in types:
                if len(a) >= len(b):
                    continue
                if (a == b[:len(a)]) and (b[len(a)] == "/"):
                    flag = False
            if flag:
                out_type.append(a)

        if full_path:
            outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                          (p1, p2, text, mention, " ".join(types)))
        else:
            outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                          (p1, p2, text, mention, " ".join(out_type)))
    outfile.close()

    df = pd.read_csv(raw_test_file,
                     sep="\t",
                     names=["p1", "p2", "text", "type", "f"])
    size = df.shape[0]
    outfile = open(clean_test_file, "w")
    for i in range(size):
        p1 = df["p1"][i]
        p2 = df["p2"][i]
        text = df["text"][i]
        types = df["type"][i].split()

        text = clear_text(text)
        tokens = text.split()
        if p1 >= len(tokens):
            continue
        mention = " ".join(tokens[p1:p2])

        if p1 == 0:
            mention = "<PAD> " + mention
        else:
            mention = tokens[p1 - 1] + " " + mention
        if p2 >= len(tokens):
            mention = mention + " <PAD>"
        else:
            mention = mention + " " + tokens[p2]

        offset = max(0, p1 - config.WINDOW_SIZE)
        text = " ".join(tokens[offset:min(len(tokens), p2 +
                                          config.WINDOW_SIZE - 1)])
        p1 -= offset
        p2 -= offset

        out_type = []
        for a in types:
            flag = True
            for b in types:
                if len(a) >= len(b):
                    continue
                if (a == b[:len(a)]) and (b[len(a)] == "/"):
                    flag = False
            if flag:
                out_type.append(a)

        if full_path:
            outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                          (p1, p2, text, mention, " ".join(types)))
        else:
            outfile.write("%d\t%d\t%s\t%s\t%s\n" %
                          (p1, p2, text, mention, " ".join(out_type)))
    outfile.close()
from sklearn.cross_validation import check_random_state
from sklearn.cross_validation import BaseShuffleSplit, ShuffleSplit, StratifiedShuffleSplit
# http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined
# The solution for me was to add the following code in a place
# that gets read before any other pylab/matplotlib/pyplot import:
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import config
from utils import dist_utils, np_utils
from utils import logging_utils, os_utils, pkl_utils, time_utils
from get_stacking_feature_conf import get_model_list

splitter_level1 = pkl_utils._load("%s/splits_level1.pkl" % config.SPLIT_DIR)
splitter_level2 = pkl_utils._load("%s/splits_level2.pkl" % config.SPLIT_DIR)
splitter_level3 = pkl_utils._load("%s/splits_level3.pkl" % config.SPLIT_DIR)
assert len(splitter_level1) == len(splitter_level2)
assert len(splitter_level1) == len(splitter_level3)
n_iter = len(splitter_level1)


class StratifiedShuffleSplitReplacement(BaseShuffleSplit):
    def __init__(self,
                 y,
                 n_iter=10,
                 test_size=0.1,
                 train_size=None,
                 random_state=None):
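        # the class body is truncated in this listing; the lines below are a
        # minimal assumed sketch of the setup, mirroring the old
        # sklearn.cross_validation StratifiedShuffleSplit (assumes numpy is
        # imported as np; the with-replacement sampling itself would live in
        # _iter_indices, which is not shown)
        super(StratifiedShuffleSplitReplacement, self).__init__(
            len(y), n_iter, test_size, train_size, random_state)
        self.y = np.array(y)
        self.classes, self.y_indices = np.unique(y, return_inverse=True)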
def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand", 
        "product_color",
        "product_title",
        "search_term", 
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser="html.parser"), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"), 
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]


    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])


    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    logger.info("Run GoogleQuerySpellingChecker at search_term")
    checker = GoogleQuerySpellingChecker()
    dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)


    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])
    # query expansion
    list_processor = ListProcessor(processors)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    # query expansion
    list_processor = ListProcessor(stemmers)
    base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
    qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
    dfAll["search_term_alt"] = qe.build()
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
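# ------------------------------------------------------------------
# NOTE: _split_attr_to_text / _split_attr_to_list are not shown in this
# snippet. A minimal sketch of what they are assumed to do, based on the
# "attr_name1 | attr_value1 | attr_name2 | attr_value2 | ..." format
# described in the column comments; the real helpers (and the exact
# separator) may differ.
def _split_attr_to_list_sketch(attr_concat):
    # hypothetical: split "name | value | name | value | ..." into (name, value) pairs
    tokens = [t.strip() for t in attr_concat.split("|")]
    return list(zip(tokens[0::2], tokens[1::2]))

def _split_attr_to_text_sketch(attr_concat):
    # hypothetical: keep only the attribute values as a single text blob
    return " ".join(value for _, value in _split_attr_to_list_sketch(attr_concat))
# ------------------------------------------------------------------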
Exemple #31
def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log"%now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form 
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # "product_attribute_list",
        "product_attribute_concat",
        "product_description",
        "product_brand", 
        "product_color",
        "product_title",
        "search_term", 
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(), 
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(), 
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA), 
        LetterLetterSplitter(),
        DigitLetterSplitter(), 
        DigitCommaDigitMerger(), 
        NumberDigitMapper(),
        UnitConverter(), 
        QuartetCleaner(), 
        HtmlCleaner(parser="html.parser"), 
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"), 
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]


    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll["search_term_product_name"] = dfAll["search_term"].apply(ext.transform)
    dfAll["product_title_product_name"] = dfAll["product_title"].apply(ext.transform)
    if config.TASK == "sample":
        print(dfAll[["search_term", "search_term_product_name", "product_title_product_name"]])


    ## clean using GoogleQuerySpellingChecker
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info("Run GoogleQuerySpellingChecker at search_term")
        checker = GoogleQuerySpellingChecker()
        dfAll["search_term"] = dfAll["search_term"].apply(checker.correct)


    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    if config.TASK == "sample":
        print(dfAll[["product_attribute", "product_attribute_list"]])
    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## auto correcting query
    if config.AUTO_CORRECTING_QUERY:
        logger.info("Run AutoSpellingChecker at search_term")
        checker = AutoSpellingChecker(dfAll, exclude_stopwords=False, min_len=4)
        dfAll["search_term_auto_corrected"] = list(dfAll["search_term"].apply(checker.correct))
        columns_to_proc += ["search_term_auto_corrected"]
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_auto_corrected"]])
        # save query_correction_map and spelling checker
        fname = "%s/auto_spelling_checker_query_correction_map_%s.log"%(config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED)
        columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])


    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers, config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll["product_attribute"] = dfAll["product_attribute_concat"].apply(_split_attr_to_text)
    dfAll["product_attribute_list"] = dfAll["product_attribute_concat"].apply(_split_attr_to_list)
    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll, ngram=3, stopwords_threshold=0.9, base_stopwords=base_stopwords)
        dfAll["search_term_alt"] = qe.build()
        if config.TASK == "sample":
            print(dfAll[["search_term", "search_term_alt"]])
    # save data
    logger.info("Save to %s"%config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [col for col in dfAll.columns if col != "product_attribute_concat"]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
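# ------------------------------------------------------------------
# A minimal sketch of the processor interface assumed by ListProcessor /
# DataFrameParallelProcessor above: each processor exposes transform(text)
# and is applied in order to every element. Illustration only; the real
# classes in this repo add parallelism and more bookkeeping.
class LowerCaseSketchProcessor:
    def transform(self, text):
        return text.lower()

class ListProcessorSketch:
    def __init__(self, processors):
        self.processors = processors

    def process(self, texts):
        out = []
        for text in texts:
            for processor in self.processors:
                text = processor.transform(text)
            out.append(text)
        return out

# usage sketch: ListProcessorSketch([LowerCaseSketchProcessor()]).process(["1/2 Inch Rubber"])
# ------------------------------------------------------------------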
    def combine(self):
        # combine meta features
        if self.meta_feature_dict:
            cb = Combiner(feature_dict=self.meta_feature_dict,
                          feature_name=self.feature_name,
                          feature_suffix=".pkl",
                          corr_threshold=self.corr_threshold)
            cb.combine()
            self.X_train_basic = cb.X_train
            self.X_test_basic = cb.X_test
            self.feature_names_basic = cb.feature_names_basic
            self.feature_names.extend(cb.feature_names)
        else:
            self.X_train_basic = None
            self.X_test_basic = None

        # combine other features
        dfAll = pkl_utils._load(config.INFO_DATA)

        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)

        ## all
        first = True
        feat_cnt = 0
        feature_dir = "%s/All" % (config.OUTPUT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                fname = file_name.split(".")[2]
                if fname not in self.feature_list:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                # load prediction
                x = self.load_feature(feature_dir, "test.pred." + fname)
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                dfTest[fname] = x
                feat_cnt += 1
                self.feature_names_cv.append(fname)
                self.feature_names.append(fname)
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
                # load probability if any
                try:
                    x = self.load_feature(feature_dir,
                                          "test.proba." + fname,
                                          columns=None,
                                          columns_pattern="proba")
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    for i in range(dim):
                        dfTest["%s_proba%d" % (fname, i)] = x[:, i]
                    self.logger.info(
                        "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    self.feature_names.extend(
                        ["%s_proba%d" % (fname, i) for i in range(dim)])
                except:
                    pass

        dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.X_test = dfTest.drop(["id", "relevance"],
                                  axis=1).values.astype(float)
        if self.meta_feature_dict:
            self.X_test = np.hstack([self.X_test_basic, self.X_test])

        ## for cv features
        first = True
        for run in range(1, self.n_iter + 1):
            feat_cnt = 0
            idx1 = splitter_level1[run - 1][1]
            idx2 = splitter_level2[run - 1][1]
            if self.feature_level == 2:
                idx = idx1
            elif self.feature_level == 3:
                idx = [idx1[i] for i in idx2]
            self.splitter_prev[run - 1] = idx
            dfTrain_cv = dfTrain.iloc[idx].copy()
            feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                    fname = file_name.split(".")[2]
                    if (fname not in self.feature_list) or (
                            fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cnt == 0:
                        self.logger.info("Run %d" % run)
                    # load prediction
                    x = self.load_feature(feature_dir, "valid.pred." + fname)
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    # also including level 1 models' predictions
                    if x.shape[0] > len(idx):
                        x = x[idx2]
                    dfTrain_cv[fname] = x
                    feat_cnt += 1
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    # load probability if any
                    try:
                        x = self.load_feature(feature_dir,
                                              "valid.proba." + fname,
                                              columns=None,
                                              columns_pattern="proba")
                        x = np.nan_to_num(x)
                        dim = np_utils._dim(x)
                        # also including level 1 models' predictions
                        if x.shape[0] > len(idx):
                            x = x[idx2]
                        for i in range(dim):
                            dfTrain_cv["%s_proba%d" % (fname, i)] = x[:, i]
                        self.logger.info(
                            "Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                                feat_cnt, len(self.feature_list), fname, dim))
                    except:
                        pass

            dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            if run == 1:
                self.y_train_cv = [0] * self.n_iter
                self.X_train_cv = [0] * self.n_iter
            self.y_train_cv[run - 1] = dfTrain_cv["relevance"].values.astype(float)
            self.X_train_cv[run - 1] = dfTrain_cv.drop(["id", "relevance"], axis=1).values.astype(float)

        if self.has_basic:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train_cv[self.n_iter - 1]),
                              self.X_train_basic.shape[1] +
                              self.X_train_cv[self.n_iter - 1].shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train_cv[self.n_iter - 1]),
                              self.X_train_cv[self.n_iter - 1].shape[1]))
        self.logger.info("Done combinning.")

        return self
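# ------------------------------------------------------------------
# How the level-2 / level-3 validation indices compose in combine() above
# (idx = [idx1[i] for i in idx2]): a tiny worked example, independent of the
# actual splits stored in splitter_level1 / splitter_level2.
idx1 = [10, 42, 7, 3]          # level-1 validation rows (positions in dfTrain)
idx2 = [0, 2]                  # level-2 validation rows (positions within idx1)
idx = [idx1[i] for i in idx2]  # rows the higher-level features are aligned to
assert idx == [10, 7]
# ------------------------------------------------------------------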
Exemple #33
    def _load_data_dict(self):
        fname = os.path.join(config.FEAT_DIR+"/Combine", self.feature_name+config.FEAT_FILE_SUFFIX)
        data_dict = pkl_utils._load(fname)
        return data_dict
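# ------------------------------------------------------------------
# pkl_utils itself is not shown in these snippets. A plausible minimal
# sketch of the _save/_load helpers used throughout (assumption: thin
# pickle wrappers); the real module may add logging or compression.
import pickle

def _pkl_save_sketch(fname, data, protocol=pickle.HIGHEST_PROTOCOL):
    with open(fname, "wb") as f:
        pickle.dump(data, f, protocol)

def _pkl_load_sketch(fname):
    with open(fname, "rb") as f:
        return pickle.load(f)
# ------------------------------------------------------------------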
Exemple #34
def main():
    logname = "generate_feature_group_distance_stat_%s.log" % time_utils._timestamp(
    )
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = [
        "DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"
    ]

    match_list = [
        "MatchQueryCount",
        "MatchQueryRatio",
        "LongestMatchRatio",
    ]

    tfidf_list = [
        "StatCoocTF_Unigram_Mean",
        "StatCoocTF_Unigram_Max",
        "StatCoocTF_Unigram_Min",
        # "StatCoocNormTF_Unigram_Mean",
        # "StatCoocNormTF_Unigram_Max",
        # "StatCoocNormTF_Unigram_Min",
        "StatCoocTFIDF_Unigram_Mean",
        "StatCoocTFIDF_Unigram_Max",
        "StatCoocTFIDF_Unigram_Min",
        "StatCoocBM25_Unigram_Mean",
        "StatCoocBM25_Unigram_Max",
        "StatCoocBM25_Unigram_Min",
        # "StatCoocTF_Bigram_Mean",
        # "StatCoocTF_Bigram_Max",
        # "StatCoocTF_Bigram_Min",
        # "StatCoocNormTF_Bigram_Mean",
        # "StatCoocNormTF_Bigram_Max",
        # "StatCoocNormTF_Bigram_Min",
        # "StatCoocTFIDF_Bigram_Mean",
        # "StatCoocTFIDF_Bigram_Max",
        # "StatCoocTFIDF_Bigram_Min",
        # "StatCoocBM25_Bigram_Mean",
        # "StatCoocBM25_Bigram_Max",
        # "StatCoocBM25_Bigram_Min",
        # "StatCoocTF_Trigram_Mean",
        # "StatCoocTF_Trigram_Max",
        # "StatCoocTF_Trigram_Min",
        # "StatCoocNormTF_Trigram_Mean",
        # "StatCoocNormTF_Trigram_Max",
        # "StatCoocNormTF_Trigram_Min",
        # "StatCoocTFIDF_Trigram_Mean",
        # "StatCoocTFIDF_Trigram_Max",
        # "StatCoocTFIDF_Trigram_Min",
        # "StatCoocBM25_Trigram_Mean",
        # "StatCoocBM25_Trigram_Max",
        # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [
        "IntersectCount_Unigram",
        "IntersectRatio_Unigram",
        # "IntersectCount_Bigram",
        # "IntersectRatio_Bigram",
        # "IntersectCount_Trigram",
        # "IntersectRatio_Trigram",
    ]
    first_last_ngram_list = [
        "FirstIntersectCount_Unigram",
        "FirstIntersectRatio_Unigram",
        "LastIntersectCount_Unigram",
        "LastIntersectRatio_Unigram",
        # "FirstIntersectCount_Bigram",
        # "FirstIntersectRatio_Bigram",
        # "LastIntersectCount_Bigram",
        # "LastIntersectRatio_Bigram",
        # "FirstIntersectCount_Trigram",
        # "FirstIntersectRatio_Trigram",
        # "LastIntersectCount_Trigram",
        # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
        "CooccurrenceCount_Unigram",
        "CooccurrenceRatio_Unigram",
        # "CooccurrenceCount_Bigram",
        # "CooccurrenceRatio_Bigram",
        # "CooccurrenceCount_Trigram",
        # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
        "JaccardCoef_Unigram",
        # "JaccardCoef_Bigram",
        # "JaccardCoef_Trigram",
        "DiceDistance_Unigram",
        # "DiceDistance_Bigram",
        # "DiceDistance_Trigram",
    ]

    char_dist_sim_list = [
        "CharDistribution_CosineSim",
        "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
        "TFIDF_Word_Unigram_CosineSim",
        # "TFIDF_Word_Bigram_CosineSim",
        # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
        # "TFIDF_Char_Bigram_CosineSim",
        # "TFIDF_Char_Trigram_CosineSim",
        "TFIDF_Char_Fourgram_CosineSim",
        # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
        "LSA100_Word_Unigram_CosineSim",
        # "LSA100_Word_Bigram_CosineSim",
        # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
        # "LSA100_Char_Bigram_CosineSim",
        # "LSA100_Char_Trigram_CosineSim",
        "LSA100_Char_Fourgram_CosineSim",
        # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
        "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
        "Word2Vec_N_Similarity",
        "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
        "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append(["search_term"])
    target_fields_list.append(["product_title", "product_title_product_name"])
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(
            os.path.join(config.FEAT_DIR, group_id_name + "_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list,
                                                 target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s" % (distance_generator,
                                                    obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(
                                os.path.join(config.FEAT_DIR,
                                             dist_name + "_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list,
                                                    dist_name, group_id_name,
                                                    aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i, feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD" % (feat_name, dim)
                                    pkl_utils._save(
                                        os.path.join(
                                            config.FEAT_DIR,
                                            fname + config.FEAT_FILE_SUFFIX),
                                        x[:, i])
                                    corr = np_utils._corr(
                                        x[:TRAIN_SIZE, i], y_train)
                                    logger.info("%s (%dD): corr = %.6f" %
                                                (fname, dim, corr))
                        except:
                            logger.info("Skip %s" % dist_name)
                            pass
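# ------------------------------------------------------------------
# GroupDistanceStat is not defined in this snippet. A hedged sketch of the
# aggregation it is assumed to perform: for each row, aggregate a distance
# feature over all rows sharing the same group id (mean/max/min here), then
# map the statistics back to the original row order.
import pandas as pd

def group_distance_stat_sketch(dist_list, group_id_list, aggregation_mode):
    df = pd.DataFrame({"dist": dist_list, "gid": group_id_list})
    stats = df.groupby("gid")["dist"].agg(aggregation_mode)
    # one column per aggregation, aligned back to the original rows
    return stats.loc[df["gid"]].to_numpy()

# e.g. group_distance_stat_sketch([0.1, 0.9, 0.4], [1, 1, 2], ["mean", "max", "min"])
# ------------------------------------------------------------------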
from sklearn.cross_validation import BaseShuffleSplit, ShuffleSplit, StratifiedShuffleSplit
# http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined
# The solution for me was to add the following code in a place 
# that gets read before any other pylab/matplotlib/pyplot import:
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import config
from utils import dist_utils, np_utils
from utils import logging_utils, os_utils, pkl_utils, time_utils
from get_stacking_feature_conf import get_model_list


splitter_level1 = pkl_utils._load("%s/splits_level1.pkl"%config.SPLIT_DIR)
splitter_level2 = pkl_utils._load("%s/splits_level2.pkl"%config.SPLIT_DIR)
splitter_level3 = pkl_utils._load("%s/splits_level3.pkl"%config.SPLIT_DIR)
assert len(splitter_level1) == len(splitter_level2)
assert len(splitter_level1) == len(splitter_level3)
n_iter = len(splitter_level1)


class StratifiedShuffleSplitReplacement(BaseShuffleSplit):

    def __init__(self, y, n_iter=10, test_size=0.1, train_size=None,
                 random_state=None):

        super(StratifiedShuffleSplitReplacement, self).__init__(
            len(y), n_iter, test_size, train_size, random_state)
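# ------------------------------------------------------------------
# Only __init__ of StratifiedShuffleSplitReplacement is shown above; the
# resampling itself lives elsewhere. A standalone, hedged sketch of what a
# stratified split "with replacement" could look like: each class is
# bootstrap-sampled to its own size, preserving class proportions.
import numpy as np

def stratified_bootstrap_indices_sketch(y, random_state=None):
    rng = np.random.RandomState(random_state)
    y = np.asarray(y)
    sampled = []
    for cls in np.unique(y):
        cls_idx = np.where(y == cls)[0]
        sampled.append(rng.choice(cls_idx, size=len(cls_idx), replace=True))
    return np.concatenate(sampled)
# ------------------------------------------------------------------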
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log"%(which, time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/Homedepot-word2vec-D%d-min_count%d.model"%(config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT) )
        model_prefixes.append( "Homedepot" )
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append( config.GLOVE_WORD2VEC_MODEL_DIR + "/glove.6B.300d.txt" )
        model_prefixes.append( "Wikipedia" )
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append( config.WORD2VEC_MODEL_DIR + "/GoogleNews-vectors-negative300.bin" )
        model_prefixes.append( "GoogleNews" )

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs, model_prefixes):
        ## load model
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(word2vec_model_dir)
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity, 
            Word2Vec_N_Similarity_Imp, 
            Word2Vec_Centroid_RMSE, 
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff, 
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append( ["search_term", "search_term_alt", "search_term_auto_corrected"][:1] )
        target_fields_list.append( ["product_title", "product_description", "product_attribute", "product_brand", "product_color"] )
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix, aggregation_mode, aggregation_mode_prev]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields, target_fields, param_list, config.FEAT_DIR, logger)
                pf.go()
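# ------------------------------------------------------------------
# A hedged sketch of the kind of pairwise statistic Word2Vec_CosineSim is
# assumed to compute: cosine similarity between every query token and every
# target token, then aggregated (the real generator additionally applies the
# double aggregation configured above and uses a trained gensim model).
import numpy as np

def w2v_cosine_sim_sketch(obs_tokens, target_tokens, vectors, agg=np.mean):
    # vectors: dict mapping token -> 1D embedding (illustrative assumption)
    sims = []
    for a in obs_tokens:
        for b in target_tokens:
            if a in vectors and b in vectors:
                va, vb = vectors[a], vectors[b]
                denom = np.linalg.norm(va) * np.linalg.norm(vb) + 1e-12
                sims.append(float(np.dot(va, vb) / denom))
    return agg(sims) if sims else 0.0
# ------------------------------------------------------------------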
    def combine(self):
        # combine meta features
        if self.meta_feature_dict:
            cb = Combiner(feature_dict=self.meta_feature_dict, 
                        feature_name=self.feature_name, 
                        feature_suffix=".pkl", 
                        corr_threshold=self.corr_threshold)
            cb.combine()
            self.X_train_basic = cb.X_train
            self.X_test_basic = cb.X_test
            self.feature_names_basic = cb.feature_names_basic
            self.feature_names.extend(cb.feature_names)
        else:
            self.X_train_basic = None
            self.X_test_basic = None

        # combine other features
        dfAll = pkl_utils._load(config.INFO_DATA)

        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)

        ## all
        first = True
        feat_cnt = 0
        feature_dir = "%s/All" % (config.OUTPUT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                fname = file_name.split(".")[2]
                if fname not in self.feature_list:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                # load prediction
                x = self.load_feature(feature_dir, "test.pred."+fname)
                x = np.nan_to_num(x)
                dim = np_utils._dim(x)
                dfTest[fname] = x
                feat_cnt += 1
                self.feature_names_cv.append(fname)
                self.feature_names.append(fname)
                self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                    feat_cnt, len(self.feature_list), fname, dim))
                # load probability if any
                try:
                    x = self.load_feature(feature_dir, "test.proba."+fname, 
                                        columns=None, columns_pattern="proba")
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    for i in range(dim):
                        dfTest["%s_proba%d"%(fname, i)] = x[:,i]
                    self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_list), fname, dim))
                    self.feature_names.extend(["%s_proba%d"%(fname, i) for i in range(dim)])
                except:
                    pass

        dfTest.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        self.X_test = dfTest.drop(["id","relevance"], axis=1).values.astype(float)
        if self.meta_feature_dict:
            self.X_test = np.hstack([self.X_test_basic, self.X_test])

        ## for cv features
        first = True
        for run in range(1,self.n_iter+1):
            feat_cnt = 0
            idx1 = splitter_level1[run-1][1]
            idx2 = splitter_level2[run-1][1]
            if self.feature_level == 2:
                idx = idx1
            elif self.feature_level == 3:
                idx = [ idx1[i] for i in idx2 ]
            self.splitter_prev[run-1] = idx
            dfTrain_cv = dfTrain.iloc[idx].copy()
            feature_dir = "%s/Run%d" % (config.OUTPUT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    ## NOTE the name is valid.pred.MODEL.csv and test.pred.MODEL.csv
                    fname = file_name.split(".")[2]
                    if (fname not in self.feature_list) or (fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cnt == 0:
                        self.logger.info("Run %d"%run)
                    # load prediction
                    x = self.load_feature(feature_dir, "valid.pred."+fname)
                    x = np.nan_to_num(x)
                    dim = np_utils._dim(x)
                    # also including level 1 models' predictions
                    if x.shape[0] > len(idx):
                        x = x[idx2]
                    dfTrain_cv[fname] = x
                    feat_cnt += 1
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_list), fname, dim))
                    # load probability if any
                    try:
                        x = self.load_feature(feature_dir, "valid.proba."+fname, 
                                            columns=None, columns_pattern="proba")
                        x = np.nan_to_num(x)
                        dim = np_utils._dim(x)
                        # also including level 1 models' predictions
                        if x.shape[0] > len(idx):
                            x = x[idx2]
                        for i in range(dim):
                            dfTrain_cv["%s_proba%d"%(fname, i)] = x[:,i]
                        self.logger.info("Combine {:>3}/{:>3} proba feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_list), fname, dim))
                    except:
                        pass

            dfTrain_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            if run == 1:
                self.y_train_cv = [0]*self.n_iter
                self.X_train_cv = [0]*self.n_iter
            self.y_train_cv[run-1] = dfTrain_cv["relevance"].values.astype(float)
            self.X_train_cv[run-1] = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)

        if self.has_basic:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train_cv[self.n_iter-1]), self.X_train_basic.shape[1] + self.X_train_cv[self.n_iter-1].shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train_cv[self.n_iter-1]), self.X_train_cv[self.n_iter-1].shape[1]))
        self.logger.info("Done combinning.")
        
        return self
    def load_feature(self, feature_dir, feature_name):
        fname = os.path.join(feature_dir, feature_name+self.feature_suffix)
        return pkl_utils._load(fname)
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d"%(fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt, len(self.feature_dict.keys()), fname, dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id","relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id","relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan"%fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info("Drop: {} ({}D) (abs corr = {}, < threshold = {})".format(
                            fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d"%(fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim, corr))
                else:
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id","relevance"], axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1,self.n_iter+1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d"%run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan"%fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d"%(fname, x) for x in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info("Combine {:>3}/{:>3} feat: {} ({}D)".format(
                        feat_cnt+feat_cv_cnt, len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id","relevance"], axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros((X_tmp.shape[0], X_tmp.shape[1], self.n_iter), dtype=float)
                self.X_train_cv[:,:,run-1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d"%(len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d"%(
                len(self.y_train), self.X_train.shape[1]+self.X_train_cv_all.shape[1])) 
        self.logger.info("Done combinning.")

        return self
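# ------------------------------------------------------------------
# The corr_threshold filter in combine() above relies on np_utils._corr. A
# hedged sketch of the assumed behaviour (Pearson correlation with a guard
# for constant features); the real helper may handle edge cases differently.
import numpy as np

def corr_sketch(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.std() == 0.0 or y.std() == 0.0:
        return 0.0
    return float(np.corrcoef(x, y)[0, 1])

# a non-mandatory 1D feature is kept only when abs(corr_sketch(x[:TRAIN_SIZE], y_train)) >= corr_threshold
# ------------------------------------------------------------------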
def main():
    logname = "generate_feature_basic_%s.log" % time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)

    ## basic
    generators = [DocId, DocLen, DocFreq, DocEntropy, DigitCount, DigitRatio]
    obs_fields = [
        "search_term", "product_title", "product_description",
        "product_attribute", "product_brand", "product_color"
    ]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger)
        sf.go()

    ## for product_uid
    generators = [
        DocIdEcho, DocFreq, ProductUidDummy1, ProductUidDummy2,
        ProductUidDummy3
    ]
    obs_fields = ["product_uid"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger)
        sf.go()

    ## unique count
    generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
    obs_fields = [
        "search_term", "product_title", "product_description",
        "product_attribute", "product_brand", "product_color"
    ]
    ngrams = [1, 2, 3]
    for generator in generators:
        for ngram in ngrams:
            param_list = [ngram]
            sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields,
                                          param_list, config.FEAT_DIR, logger)
            sf.go()

    ## for product_attribute_list
    generators = [
        AttrCount,
        AttrBulletCount,
        AttrBulletRatio,
        AttrNonBulletCount,
        AttrNonBulletRatio,
        AttrHasProductHeight,
        AttrHasProductWidth,
        AttrHasProductLength,
        AttrHasProductDepth,
        AttrHasIndoorOutdoor,
    ]
    obs_fields = ["product_attribute_list"]
    for generator in generators:
        param_list = []
        sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list,
                                      config.FEAT_DIR, logger)
        sf.go()
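# ------------------------------------------------------------------
# The basic generators above (DocLen, DigitCount, ...) are not shown. A
# hedged sketch of the interface StandaloneFeatureWrapper appears to expect:
# __name__() for the feature name and transform_one() returning one value
# per row. Illustration only; the real base classes carry more machinery.
class DocLenSketch:
    def __init__(self, obs_corpus, target_corpus=None):
        self.obs_corpus = obs_corpus

    def __name__(self):
        return "DocLen"

    def transform_one(self, obs, target, id):
        # number of whitespace-separated tokens in the observation field
        return len(obs.split(" "))

# usage sketch: DocLenSketch(["red garden hose"]).transform_one("red garden hose", None, 0) -> 3
# ------------------------------------------------------------------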
def main():
    logname = "generate_feature_group_distance_stat_%s.log"%time_utils._timestamp()
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED_STEMMED)
    y_train = dfAll["relevance"].values[:TRAIN_SIZE]

    group_id_names = ["DocId_search_term", "DocId_product_title", "DocIdEcho_product_uid"]

    match_list = [
    "MatchQueryCount",
    "MatchQueryRatio",
    "LongestMatchRatio",
    ]

    tfidf_list = [
    "StatCoocTF_Unigram_Mean", 
    "StatCoocTF_Unigram_Max",
    "StatCoocTF_Unigram_Min",
    # "StatCoocNormTF_Unigram_Mean", 
    # "StatCoocNormTF_Unigram_Max",
    # "StatCoocNormTF_Unigram_Min", 
    "StatCoocTFIDF_Unigram_Mean",
    "StatCoocTFIDF_Unigram_Max",
    "StatCoocTFIDF_Unigram_Min",
    "StatCoocBM25_Unigram_Mean",
    "StatCoocBM25_Unigram_Max",
    "StatCoocBM25_Unigram_Min",
    # "StatCoocTF_Bigram_Mean", 
    # "StatCoocTF_Bigram_Max",
    # "StatCoocTF_Bigram_Min",
    # "StatCoocNormTF_Bigram_Mean", 
    # "StatCoocNormTF_Bigram_Max",
    # "StatCoocNormTF_Bigram_Min",
    # "StatCoocTFIDF_Bigram_Mean",
    # "StatCoocTFIDF_Bigram_Max",
    # "StatCoocTFIDF_Bigram_Min",
    # "StatCoocBM25_Bigram_Mean",
    # "StatCoocBM25_Bigram_Max",
    # "StatCoocBM25_Bigram_Min",
    # "StatCoocTF_Trigram_Mean", 
    # "StatCoocTF_Trigram_Max",
    # "StatCoocTF_Trigram_Min",
    # "StatCoocNormTF_Trigram_Mean", 
    # "StatCoocNormTF_Trigram_Max",
    # "StatCoocNormTF_Trigram_Min", 
    # "StatCoocTFIDF_Trigram_Mean",
    # "StatCoocTFIDF_Trigram_Max",
    # "StatCoocTFIDF_Trigram_Min",
    # "StatCoocBM25_Trigram_Mean",
    # "StatCoocBM25_Trigram_Max",
    # "StatCoocBM25_Trigram_Min",
    ]
    intersect_ngram_count_list = [    
    "IntersectCount_Unigram", 
    "IntersectRatio_Unigram", 
    # "IntersectCount_Bigram", 
    # "IntersectRatio_Bigram", 
    # "IntersectCount_Trigram", 
    # "IntersectRatio_Trigram", 
    ]
    first_last_ngram_list = [
    "FirstIntersectCount_Unigram", 
    "FirstIntersectRatio_Unigram", 
    "LastIntersectCount_Unigram", 
    "LastIntersectRatio_Unigram",
    # "FirstIntersectCount_Bigram", 
    # "FirstIntersectRatio_Bigram", 
    # "LastIntersectCount_Bigram", 
    # "LastIntersectRatio_Bigram",
    # "FirstIntersectCount_Trigram", 
    # "FirstIntersectRatio_Trigram", 
    # "LastIntersectCount_Trigram", 
    # "LastIntersectRatio_Trigram",
    ]

    cooccurrence_ngram_count_list = [
    "CooccurrenceCount_Unigram", 
    "CooccurrenceRatio_Unigram", 
    # "CooccurrenceCount_Bigram", 
    # "CooccurrenceRatio_Bigram",
    # "CooccurrenceCount_Trigram", 
    # "CooccurrenceRatio_Trigram",
    ]

    ngram_jaccard_list = [
    "JaccardCoef_Unigram", 
    # "JaccardCoef_Bigram", 
    # "JaccardCoef_Trigram", 
    "DiceDistance_Unigram", 
    # "DiceDistance_Bigram", 
    # "DiceDistance_Trigram", 
    ]

    char_dist_sim_list = [
    "CharDistribution_CosineSim",
    "CharDistribution_KL",
    ]

    tfidf_word_ngram_cosinesim_list = [
    "TFIDF_Word_Unigram_CosineSim",
    # "TFIDF_Word_Bigram_CosineSim",
    # "TFIDF_Word_Trigram_CosineSim",
    ]
    tfidf_char_ngram_cosinesim_list = [
    # "TFIDF_Char_Bigram_CosineSim",
    # "TFIDF_Char_Trigram_CosineSim",
    "TFIDF_Char_Fourgram_CosineSim",
    # "TFIDF_Char_Fivegram_CosineSim",
    ]

    lsa_word_ngram_cosinesim_list = [
    "LSA100_Word_Unigram_CosineSim",
    # "LSA100_Word_Bigram_CosineSim",
    # "LSA100_Word_Trigram_CosineSim",
    ]
    lsa_char_ngram_cosinesim_list = [
    # "LSA100_Char_Bigram_CosineSim",
    # "LSA100_Char_Trigram_CosineSim",
    "LSA100_Char_Fourgram_CosineSim",
    # "LSA100_Char_Fivegram_CosineSim",
    ]

    doc2vec_list = [
    "Doc2Vec_Homedepot_D100_CosineSim",
    ]

    word2vec_list = [
    "Word2Vec_N_Similarity",
    "Word2Vec_Homedepot_D100_CosineSim_Mean_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Max_Mean",
    "Word2Vec_Homedepot_D100_CosineSim_Min_Mean",
    ]

    distance_generator_list = \
    match_list + \
    tfidf_list + \
    intersect_ngram_count_list + \
    first_last_ngram_list + \
    cooccurrence_ngram_count_list + \
    ngram_jaccard_list + \
    tfidf_word_ngram_cosinesim_list + \
    tfidf_char_ngram_cosinesim_list + \
    lsa_word_ngram_cosinesim_list + \
    lsa_char_ngram_cosinesim_list + \
    char_dist_sim_list + \
    word2vec_list + \
    doc2vec_list

    obs_fields_list = []
    target_fields_list = []
    ## query in document
    obs_fields_list.append( ["search_term"] )
    target_fields_list.append( ["product_title", "product_title_product_name"] )
    aggregation_mode = ["mean", "max", "min"]
    for group_id_name in group_id_names:
        group_id_list = pkl_utils._load(os.path.join(config.FEAT_DIR, group_id_name+"_1D.pkl"))
        for distance_generator in distance_generator_list:
            for obs_fields, target_fields in zip(obs_fields_list, target_fields_list):
                for obs_field in obs_fields:
                    for target_field in target_fields:
                        dist_name = "%s_%s_x_%s"%(distance_generator, obs_field, target_field)
                        try:
                            dist_list = pkl_utils._load(os.path.join(config.FEAT_DIR, dist_name+"_1D.pkl"))
                            ext = GroupDistanceStat(dist_list, group_id_list, dist_name, group_id_name, aggregation_mode)
                            x = ext.transform()
                            if isinstance(ext.__name__(), list):
                                for i,feat_name in enumerate(ext.__name__()):
                                    dim = 1
                                    fname = "%s_%dD"%(feat_name, dim)
                                    pkl_utils._save(os.path.join(config.FEAT_DIR, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                                    corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                                    logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                        except:
                            logger.info("Skip %s"%dist_name)
                            pass
Exemple #42
def main():

    ###########
    ## Setup ##
    ###########
    logname = "data_processor_%s.log" % now
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process
    columns_to_proc = [
        "question1",
        "question2",
    ]
    if config.PLATFORM == "Linux":
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        UnicodeConverter(),
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        UnitConverter(),
        LowerUpperCaseSplitter(),
        # WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser="html.parser"),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type="snowball"),
        Stemmer(stemmer_type="porter")
    ][0:1]

    ## simple test
    text = "1/2 inch rubber lep tips Bullet07"
    print("Original:")
    print(text)
    list_processor = ListProcessor(processors)
    print("After:")
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll)

    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # save data
    logger.info("Save to %s" % config.ALL_DATA_LEMMATIZED_STEMMED)
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll)
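# ------------------------------------------------------------------
# UnicodeConverter is used above but not shown. A hedged sketch of a minimal
# processor with the assumed transform(text) interface: coerce bytes to str
# and normalize unicode. The real implementation may differ.
import unicodedata

class UnicodeConverterSketch:
    def transform(self, text):
        if isinstance(text, bytes):
            text = text.decode("utf-8", errors="ignore")
        return unicodedata.normalize("NFKD", text)
# ------------------------------------------------------------------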
Exemple #43
    def __init__(self, model_name, runs, params_dict, logger):
        print("Loading data...")
        words, positions, heads, tails, labels = pkl_utils._load(
            config.GROUPED_TRAIN_DATA)
        words_test, positions_test, heads_test, tails_test, labels_test = pkl_utils._load(
            config.GROUPED_TEST_DATA)  # noqa

        self.embedding = embedding_utils.Embedding(
            config.EMBEDDING_DATA,
            list([s for bags in words for s in bags]) +
            list([s for bags in words_test for s in bags]),
            config.MAX_DOCUMENT_LENGTH)

        print("Preprocessing data...")
        textlen = np.array([[self.embedding.len_transform(x) for x in y]
                            for y in words])
        words = np.array([[self.embedding.text_transform(x) for x in y]
                          for y in words])
        positions = np.array(
            [[self.embedding.position_transform(x) for x in y]
             for y in positions])

        textlen_test = np.array([[self.embedding.len_transform(x) for x in y]
                                 for y in words_test])
        words_test = np.array([[self.embedding.text_transform(x) for x in y]
                               for y in words_test])
        positions_test = np.array(
            [[self.embedding.position_transform(x) for x in y]
             for y in positions_test])  # noqa

        ss = ShuffleSplit(n_splits=1,
                          test_size=0.1,
                          random_state=config.RANDOM_SEED)
        for train_index, valid_index in ss.split(np.zeros(len(labels)),
                                                 labels):
            words_train, words_valid = words[train_index], words[valid_index]
            textlen_train, textlen_valid = textlen[train_index], textlen[valid_index]
            positions_train, positions_valid = positions[train_index], positions[valid_index]
            heads_train, heads_valid = heads[train_index], heads[valid_index]
            tails_train, tails_valid = tails[train_index], tails[valid_index]
            labels_train, labels_valid = labels[train_index], labels[valid_index]
        if "hrere" in model_name:
            self.full_set = list(
                zip(words, textlen, positions, heads, tails, labels))
            self.train_set = list(
                zip(words_train, textlen_train, positions_train, heads_train,
                    tails_train, labels_train))  # noqa
            self.valid_set = list(
                zip(words_valid, textlen_valid, positions_valid, heads_valid,
                    tails_valid, labels_valid))  # noqa
            self.test_set = list(
                zip(words_test, textlen_test, positions_test, heads_test,
                    tails_test, labels_test))  # noqa
            if "complex" in model_name:
                self.entity_embedding1 = np.load(config.ENTITY_EMBEDDING1)
                self.entity_embedding2 = np.load(config.ENTITY_EMBEDDING2)
                self.relation_embedding1 = np.load(config.RELATION_EMBEDDING1)
                self.relation_embedding2 = np.load(config.RELATION_EMBEDDING2)
            else:
                self.entity_embedding = np.load(config.ENTITY_EMBEDDING)
                self.relation_embedding = np.load(config.RELATION_EMBEDDING)
        else:
            self.full_set = list(zip(words, textlen, positions, labels))
            self.train_set = list(
                zip(words_train, textlen_train, positions_train,
                    labels_train))  # noqa
            self.valid_set = list(
                zip(words_valid, textlen_valid, positions_valid,
                    labels_valid))  # noqa
            self.test_set = list(
                zip(words_test, textlen_test, positions_test,
                    labels_test))  # noqa

        self.model_name = model_name
        self.runs = runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        self.logger = logger

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
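# ------------------------------------------------------------------
# AttrDict is used above but not defined in this snippet. A common minimal
# implementation (an assumption about what this repo's AttrDict provides):
# a dict whose keys can also be read as attributes, e.g. hparams.batch_size.
class AttrDictSketch(dict):
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

# usage sketch: AttrDictSketch({"batch_size": 32}).batch_size -> 32
# ------------------------------------------------------------------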
def main():
    ### 1. Record Time
    now = time_utils._timestamp()
    ###########
    ## Setup ##
    ###########
    logname = f'data_processor_{now}.log'
    logger = logging_utils._get_logger(config.LOG_DIR, logname)

    # Put product_attribute_list, product_attribute and product_description first as they are
    # quite time consuming to process.
    # Choose the columns by checking data_preparer.ipynb; the end of the notebook shows the cleaned data frame.
    columns_to_proc = [
        # # product_attribute_list is very time consuming to process
        # # so we just process product_attribute which is of the form
        # # attr_name1 | attr_value1 | attr_name2 | attr_value2 | ...
        # # and split it into a list afterwards
        # 'product_attribute_list',
        'product_attribute_concat',
        'product_description',
        'product_brand',
        'product_color',
        'product_title',
        'search_term',
    ]
    if config.PLATFORM == 'Linux':
        config.DATA_PROCESSOR_N_JOBS = len(columns_to_proc)

    # clean using a list of processors
    processors = [
        LowerCaseConverter(),
        # See LowerUpperCaseSplitter and UnitConverter for why we put UnitConverter here
        # In practice it makes little difference, unless we could also avoid turning a number
        # followed by the preposition 'in' into the unit in. (inch)
        UnitConverter(),
        LowerUpperCaseSplitter(),
        WordReplacer(replace_fname=config.WORD_REPLACER_DATA),
        LetterLetterSplitter(),
        DigitLetterSplitter(),
        DigitCommaDigitMerger(),
        NumberDigitMapper(),
        UnitConverter(),
        QuartetCleaner(),
        HtmlCleaner(parser='html.parser'),
        Lemmatizer(),
    ]
    stemmers = [
        Stemmer(stemmer_type='snowball'),
        Stemmer(stemmer_type='porter')
    ][0:1]  # means only use Stemmer(stemmer_type='snowball')

    ## simple test
    text = '1/2 inch rubber lep tips Bullet07'
    print('Original:')
    print(text)
    list_processor = ListProcessor(processors)
    print('After:')
    print(list_processor.process([text]))

    #############
    ## Process ##
    #############
    ## load raw data
    dfAll = pkl_utils._load(config.ALL_DATA_RAW)
    columns_to_proc = [col for col in columns_to_proc if col in dfAll.columns]

    if config.TASK == 'sample':
        dfAll = dfAll.iloc[0:config.SAMPLE_SIZE]
        print(f'data length: {len(dfAll)}')

    ## extract product name from search_term and product_title
    ext = ProductNameExtractor()
    dfAll['search_term_product_name'] = dfAll['search_term'].apply(
        ext.transform)
    dfAll['product_title_product_name'] = dfAll['product_title'].apply(
        ext.transform)
    if config.TASK == 'sample':
        print(dfAll[[
            'search_term', 'search_term_product_name',
            'product_title_product_name'
        ]])

    ## clean using GoogleQuerySpellingChecker (not used in the Chenglong team's final submission)
    # MUST BE IN FRONT OF ALL THE PROCESSING
    if config.GOOGLE_CORRECTING_QUERY:
        logger.info('Run GoogleQuerySpellingChecker at search_term')
        checker = GoogleQuerySpellingChecker()
        dfAll['search_term'] = dfAll['search_term'].apply(checker.correct)

    ## clean using a list of processors
    df_processor = DataFrameParallelProcessor(processors,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)
    if config.TASK == 'sample':
        print(dfAll[['product_attribute', 'product_attribute_list']])

    # query expansion (the Chenglong team dropped this feature from the final submission, as it might be a major cause of overfitting)
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(processors)
        # Stop words must go through the same processing as the text. For example,
        # NumberDigitMapper replaces 'one' with '1', so the stopword 'one' must
        # also become '1'.
        base_stopwords = set(list_processor.process(list(
            config.STOP_WORDS)))  # set of processed stop words
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data
    logger.info(f'Save to {config.ALL_DATA_LEMMATIZED}')
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## auto-correct query with AutoSpellingChecker (not used in the Chenglong team's final submission)
    if config.AUTO_CORRECTING_QUERY:
        logger.info('Run AutoSpellingChecker at search_term')
        checker = AutoSpellingChecker(dfAll,
                                      exclude_stopwords=False,
                                      min_len=4)
        dfAll['search_term_auto_corrected'] = list(dfAll['search_term'].apply(
            checker.correct))
        columns_to_proc += ['search_term_auto_corrected']
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_auto_corrected']])
        # save query_correction_map and spelling checker
        fname = '%s/auto_spelling_checker_query_correction_map_%s.log' % (
            config.LOG_DIR, now)
        checker.save_query_correction_map(fname)
        # save data
        logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED)
        columns_to_save = [
            col for col in dfAll.columns if col != 'product_attribute_concat'
        ]
        pkl_utils._save(config.ALL_DATA_LEMMATIZED, dfAll[columns_to_save])

    ## clean using stemmers
    df_processor = DataFrameParallelProcessor(stemmers,
                                              config.DATA_PROCESSOR_N_JOBS)
    df_processor.process(dfAll, columns_to_proc)
    # split product_attribute_concat into product_attribute and product_attribute_list
    dfAll['product_attribute'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_text)
    dfAll['product_attribute_list'] = dfAll['product_attribute_concat'].apply(
        _split_attr_to_list)

    # query expansion
    if config.QUERY_EXPANSION:
        list_processor = ListProcessor(stemmers)
        base_stopwords = set(list_processor.process(list(config.STOP_WORDS)))
        qe = QueryExpansion(dfAll,
                            ngram=3,
                            stopwords_threshold=0.9,
                            base_stopwords=base_stopwords)
        dfAll['search_term_alt'] = qe.build()
        if config.TASK == 'sample':
            print(dfAll[['search_term', 'search_term_alt']])

    # save data
    logger.info('Save to %s' % config.ALL_DATA_LEMMATIZED_STEMMED)
    columns_to_save = [
        col for col in dfAll.columns if col != 'product_attribute_concat'
    ]
    pkl_utils._save(config.ALL_DATA_LEMMATIZED_STEMMED, dfAll[columns_to_save])
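
# The _split_attr_to_text / _split_attr_to_list helpers used above are defined
# elsewhere. A rough sketch of what they might do, ASSUMING the concatenated field
# stores "name | value" pairs joined by a pair separator (both delimiters below are
# hypothetical placeholders, not taken from the project):
_ASSUMED_PAIR_SEP = " || "   # hypothetical separator between attributes
_ASSUMED_KV_SEP = " | "      # hypothetical separator between a name and its value

def _split_attr_to_list_sketch(attr_concat):
    if not isinstance(attr_concat, str) or not attr_concat:
        return []
    pairs = []
    for pair in attr_concat.split(_ASSUMED_PAIR_SEP):
        name, _, value = pair.partition(_ASSUMED_KV_SEP)
        pairs.append([name.strip(), value.strip()])
    return pairs

def _split_attr_to_text_sketch(attr_concat):
    return " ".join(name + " " + value
                    for name, value in _split_attr_to_list_sketch(attr_concat))
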
Exemple #45
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
import config
from utils import pkl_utils

combine_flag = False
suffix = 'v4'
threshold = 0.05
if combine_flag:
    cmd = "python get_feature_conf_magic.py -l 5 -m 44 -o feature_conf_magic_%s.py"%suffix
    os.system(cmd)
    cmd = "python feature_combiner.py -l 1 -c feature_conf_magic_%s -n basic_magic_%s -t %.6f"%(suffix, suffix, threshold)
    os.system(cmd)
    
feature_name = "basic_magic_{}".format(suffix)
fname = os.path.join(config.FEAT_DIR+"/Combine", feature_name+config.FEAT_FILE_SUFFIX)
data_dict = pkl_utils._load(fname)
X_train = pd.DataFrame(data_dict["X_train_basic"], columns = data_dict["feature_names"])
X_test = pd.DataFrame(data_dict["X_test"], columns = data_dict["feature_names"])
y_train = data_dict["y_train"]


X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242)
#UPDownSampling
pos_train = X_train[y_train == 1]
neg_train = X_train[y_train == 0]
X_train = pd.concat((neg_train, pos_train.iloc[:int(0.8*len(pos_train))], neg_train))
y_train = np.array([0] * neg_train.shape[0] + [1] * pos_train.iloc[:int(0.8*len(pos_train))].shape[0] + [0] * neg_train.shape[0])
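# Note: concatenating neg_train twice while keeping only 80% of the positives
# oversamples the negative class, presumably to pull the training positive rate
# down toward the rate expected at prediction time.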
print(np.mean(y_train))
del pos_train, neg_train

pos_valid = X_valid[y_valid == 1]
Exemple #46
def get_types(model_name, input_file, dev_file, output_file, options):

    checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name)
    type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
    id2type = {type2id[x]: x for x in type2id.keys()}

    # loaded differently here - the input data format differs from the commented-out call below
    # words, mentions, positions, labels = data_utils.load(input_file)
    # n = len(words)

    embedding = embedding_utils.Embedding.restore(checkpoint_file)

    test_set, test_labels, test_tokenized = create_labelset_input(
        *data_utils.load(input_file), embedding)
    dev_set, dev_labels, dev_tokenized = create_labelset_input(
        *data_utils.load(dev_file), embedding)

    store = StructuredLogitsStore(
        model_name,
        idx2label=id2type,
        hierarchical=True if "hier" in model_name else False,
        nested=False)

    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # DEFINE operations
        input_words = graph.get_operation_by_name("input_words").outputs[0]
        input_textlen = graph.get_operation_by_name("input_textlen").outputs[0]
        input_mentions = graph.get_operation_by_name(
            "input_mentions").outputs[0]
        input_mentionlen = graph.get_operation_by_name(
            "input_mentionlen").outputs[0]
        input_positions = graph.get_operation_by_name(
            "input_positions").outputs[0]
        phase = graph.get_operation_by_name("phase").outputs[0]
        dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0]
        rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0]

        pred_op = graph.get_operation_by_name("output/predictions").outputs[0]
        #proba_op = graph.get_operation_by_name("output/proba").outputs[0] #proba
        logit_op = graph.get_operation_by_name("output/scores").outputs[
            0]  #proba
        tune_op = graph.get_operation_by_name("tune").outputs[0]  # K x K
        # results_op = graph.get_operation_by_name("results").outputs[0] # require labels

        # DO THE SAME FOR DEV set!

        test_batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False)

        all_predictions = []
        all_logits = []
        for batch in test_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(
                *batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

            #probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)

            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=test_labels,
                             tokenized=test_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "test")
        store.score_set("test")

        dev_batches = data_utils.batch_iter(dev_set, 512, 1, shuffle=False)

        all_predictions = []
        all_logits = []
        for batch in dev_batches:
            words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(
                *batch)
            feed = {
                input_words: words_batch,
                input_textlen: textlen_batch,
                input_mentions: mentions_batch,
                input_mentionlen: mentionlen_batch,
                input_positions: positions_batch,
                phase: False,
                dense_dropout: 1.0,
                rnn_dropout: 1.0
            }
            batch_predictions = sess.run(pred_op, feed_dict=feed)
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

            #probas = sess.run(logit_op, feed_dict=feed)
            logit_predictions = sess.run(logit_op, feed_dict=feed)

            if len(all_logits) == 0:
                all_logits = logit_predictions
            else:
                all_logits = np.concatenate([all_logits, logit_predictions])

        store.create_labelset(
            StructuredLogits(f_x=all_logits,
                             y_true=dev_labels,
                             tokenized=dev_tokenized,
                             y_hat=None,
                             probas=None,
                             c=None,
                             document_masks=None,
                             idx2label=id2type), "dev")
        store.score_set("dev")

        #np.transpose(prior_utils.create_prior(type_info, hparams.alpha)
        # all_logits.append(logit_predictions)

    # save as pickle
    with open(os.path.join(os.path.dirname(checkpoint_file), "logits.pickle"),
              "wb") as f:
        pickle.dump(store, f)
    """     
Exemple #47
def get_types(model_name, input_file, output_file):
	checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name)
	type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
	id2type = {type2id[x]:x for x in type2id.keys()}

	df = pd.read_csv(input_file, sep="\t", names=["r", "e1", "x1", "y1", "e2", "x2", "y2", "s"]) 
	n = df.shape[0]
	words1 = np.array(df.s)
	mentions1 = np.array(df.e1)
	positions1 = np.array([[x, y] for x, y in zip(df.x1, df.y1+1)])
	words2 = np.array(df.s)
	mentions2 = np.array(df.e2)
	positions2 = np.array([[x, y] for x, y in zip(df.x2, df.y2+1)])
	
	words = np.concatenate([words1, words2])
	mentions = np.concatenate([mentions1, mentions2])
	positions = np.concatenate([positions1, positions2])

	embedding = embedding_utils.Embedding.restore(checkpoint_file)

	textlen = np.array([embedding.len_transform1(x) for x in words])
	words = np.array([embedding.text_transform1(x) for x in words])
	mentionlen = np.array([embedding.len_transform2(x) for x in mentions])
	mentions = np.array([embedding.text_transform2(x) for x in mentions])
	positions = np.array([embedding.position_transform(x) for x in positions])
	labels = np.zeros(2*n)
	test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))

	graph = tf.Graph()
	with graph.as_default():
		sess = tf.Session()
		saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
		saver.restore(sess, checkpoint_file)

		input_words = graph.get_operation_by_name("input_words").outputs[0]
		input_textlen = graph.get_operation_by_name("input_textlen").outputs[0]
		input_mentions = graph.get_operation_by_name("input_mentions").outputs[0]
		input_mentionlen = graph.get_operation_by_name("input_mentionlen").outputs[0]
		input_positions = graph.get_operation_by_name("input_positions").outputs[0]
		phase = graph.get_operation_by_name("phase").outputs[0]
		dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0]
		rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0]

		pred_op = graph.get_operation_by_name("output/predictions").outputs[0]
		batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False)
		all_predictions = []
		for batch in batches:
			words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(*batch)
			feed = {
				input_words: words_batch,
				input_textlen: textlen_batch,
				input_mentions: mentions_batch,
				input_mentionlen: mentionlen_batch,
				input_positions: positions_batch,
				phase: False,
				dense_dropout: 1.0,
				rnn_dropout: 1.0
			}
			batch_predictions = sess.run(pred_op, feed_dict=feed)
			all_predictions = np.concatenate([all_predictions, batch_predictions])
	
	df["t1"] = all_predictions[:n]
	df["t2"] = all_predictions[n:]
	df["t1"] = df["t1"].map(id2type)
	df["t2"] = df["t2"].map(id2type)
	df.to_csv(output_file, sep="\t", header=False, index=False)
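
# data_utils.batch_iter (used above to stream the test set) is defined elsewhere. A
# minimal sketch of such an iterator, assuming it yields batch_size-sized chunks for
# num_epochs passes with optional shuffling:
import random

def batch_iter_sketch(data, batch_size, num_epochs, shuffle=True):
    data = list(data)
    for _ in range(num_epochs):
        order = list(range(len(data)))
        if shuffle:
            random.shuffle(order)
        for start in range(0, len(order), batch_size):
            yield [data[i] for i in order[start:start + batch_size]]

# for batch in batch_iter_sketch(test_set, 512, 1, shuffle=False):
#     words_batch, textlen_batch, *rest = zip(*batch)
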
Exemple #48
    def _load_data_dict(self):
        fname = os.path.join(config.FEAT_DIR + "/Combine", self.feature_name + config.FEAT_FILE_SUFFIX)
        data_dict = pkl_utils._load(fname)
        return data_dict
Exemple #49
def preprocess_new(data_name, if_clean=False, full_path=False):
    if data_name == "wiki":
        raw_all_file = config.WIKI_ALL
        raw_train_file = config.WIKI_TRAIN
        raw_test_file = config.WIKI_TEST
        clean_train_file = config.WIKI_TRAIN_CLEAN
        clean_test_file = config.WIKI_TEST_CLEAN
        type_file = config.WIKI_TYPE
        raw_valid_file = config.WIKI_VALID
    elif data_name == "ontonotes":
        raw_all_file = config.ONTONOTES_ALL
        raw_train_file = config.ONTONOTES_TRAIN
        raw_test_file = config.ONTONOTES_TEST
        clean_train_file = config.ONTONOTES_TRAIN_CLEAN
        clean_test_file = config.ONTONOTES_TEST_CLEAN
        type_file = config.ONTONOTES_TYPE
        raw_valid_file = config.ONTONOTES_VALID
    elif data_name == "bbn":
        raw_all_file = config.BBN_ALL
        raw_train_file = config.BBN_TRAIN
        raw_test_file = config.BBN_TEST
        raw_valid_file = config.BBN_VALID
        clean_train_file = config.BBN_TRAIN_CLEAN
        clean_test_file = config.BBN_TEST_CLEAN
        type_file = config.BBN_TYPE
    else:
        raise AttributeError("Invalid data name!")

    if not os.path.exists(type_file):
        create_type_dict_new(raw_all_file, type_file, full_path)
    type2id, typeDict = pkl_utils._load(type_file)

    data_train = json.load(open(raw_train_file))
    data_valid = json.load(open(raw_valid_file))
    data_test = json.load(open(raw_test_file))

    data = data_train + data_valid
    size = len(data)
    outfile = open(clean_train_file, "w")
    for i in range(size):
        for j in range(len(data[i]["mentions"])):
            p1 = data[i]["mentions"][j]["start"]
            p2 = data[i]["mentions"][j]["end"]
            types = data[i]["mentions"][j]["labels"]
            if (not path_count(types) == 1) and if_clean:
                continue

            tokens = [clear_text(txt) for txt in data[i]["tokens"]]
            if p1 >= len(tokens):
                continue
            mention = " ".join(tokens[p1:p2])

            if p1 == 0:
                mention = "<PAD> " + mention
            else:
                mention = tokens[p1 - 1] + " " + mention
            if p2 >= len(tokens):
                mention = mention + " <PAD>"
            else:
                mention = mention + " " + tokens[p2]

            offset = max(0, p1 - config.WINDOW_SIZE)
            text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)])
            p1 -= offset
            p2 -= offset

            out_type = []
            for a in types:
                flag = True
                for b in types:
                    if len(a) >= len(b):
                        continue
                    if (a == b[:len(a)]) and (b[len(a)] == "/"):
                        flag = False
                if flag:
                    out_type.append(a)

            if len(out_type) > 0:
                if full_path:
                    try:
                        outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types)))
                    except:
                        continue
                else:
                    try:
                        outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type)))
                    except:
                        continue
    outfile.close()

    outfile = open(clean_test_file, "w")
    size = len(data_test)
    for i in range(size):
        for j in range(len(data_test[i]["mentions"])):
            p1 = data_test[i]["mentions"][j]["start"]
            p2 = data_test[i]["mentions"][j]["end"]
            types = data_test[i]["mentions"][j]["labels"]

            tokens = [clear_text(txt) for txt in data_test[i]["tokens"]]
            if p1 >= len(tokens):
                continue
            mention = " ".join(tokens[p1:p2])

            if p1 == 0:
                mention = "<PAD> " + mention
            else:
                mention = tokens[p1 - 1] + " " + mention
            if p2 >= len(tokens):
                mention = mention + " <PAD>"
            else:
                mention = mention + " " + tokens[p2]

            offset = max(0, p1 - config.WINDOW_SIZE)
            text = " ".join(tokens[offset:min(len(tokens), p2 + config.WINDOW_SIZE - 1)])
            p1 -= offset
            p2 -= offset

            out_type = []
            for a in types:
                flag = True
                for b in types:
                    if len(a) >= len(b):
                        continue
                    if (a == b[:len(a)]) and (b[len(a)] == "/"):
                        flag = False
                if flag:
                    out_type.append(a)

            if full_path:
                try:
                    outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(types)))
                except:
                    continue
            else:
                try:
                    outfile.write("%d\t%d\t%s\t%s\t%s\n" % (p1, p2, text, mention, " ".join(out_type)))
                except:
                    continue
    outfile.close()
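
# The nested loop above keeps only the most specific labels on each hierarchical path:
# a type is dropped whenever another type extends it with a "/"-separated child. The
# same filter as a standalone helper, for reference:
def keep_most_specific(types):
    out = []
    for a in types:
        extended = any(len(b) > len(a) and b[:len(a)] == a and b[len(a)] == "/"
                       for b in types)
        if not extended:
            out.append(a)
    return out

# keep_most_specific(["/person", "/person/artist", "/location"])
# -> ["/person/artist", "/location"]
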
Exemple #50
import config
import pandas as pd
from utils import time_utils, pkl_utils
from optparse import OptionParser
from collections import defaultdict
import os.path
import parse

G = pkl_utils._load(config.ONTOLOGY_TREE)
Root = "http://www.w3.org/2002/07/owl#Thing"


def parse_args(parser):
    parser.add_option("-l",
                      "--lang",
                      default="zh",
                      type="string",
                      dest="lang",
                      help="target language")
    parser.add_option("-p",
                      "--pivot",
                      default="en",
                      type="string",
                      dest="pivots",
                      help="pivot lanuages")
    parser.add_option(
        "-L",
        default=0.5,
        type="float",
        dest="L",
        help="parameter to tune the tradeoff between precision and recall")
    def combine(self):

        dfAll = pkl_utils._load(config.INFO_DATA)
        dfAll_raw = dfAll.copy()
        y_train = dfAll["relevance"].values[:TRAIN_SIZE]

        ## for basic features
        feat_cnt = 0
        self.logger.info("Run for basic...")
        for file_name in sorted(os.listdir(config.FEAT_DIR)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                x = self.load_feature(config.FEAT_DIR, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d" % (fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll = pd.concat([dfAll, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cnt += 1
                self.feature_names_basic.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt, len(self.feature_dict.keys()), fname,
                               dim, corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt, len(self.feature_dict.keys()), fname,
                            dim))
        dfAll.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
        ## basic
        dfTrain = dfAll.iloc[:TRAIN_SIZE].copy()
        self.y_train = dfTrain["relevance"].values.astype(float)
        dfTrain.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_train = dfTrain.values.astype(float)

        dfTest = dfAll.iloc[TRAIN_SIZE:].copy()
        self.id_test = dfTest["id"].values.astype(int)
        dfTest.drop(["id", "relevance"], axis=1, inplace=True)
        self.X_test = dfTest.values.astype(float)

        ## all
        first = True
        feat_cv_cnt = 0
        dfAll_cv_all = dfAll_raw.copy()
        feature_dir = "%s/All" % (config.FEAT_DIR)
        for file_name in sorted(os.listdir(feature_dir)):
            if self.feature_suffix in file_name:
                fname = file_name.split(".")[0]
                if fname not in self.feature_dict:
                    continue
                if first:
                    self.logger.info("Run for all...")
                    first = False
                x = self.load_feature(feature_dir, fname)
                x = np.nan_to_num(x)
                if np.isnan(x).any():
                    self.logger.info("%s nan" % fname)
                    continue
                # apply feature transform
                mandatory = self.feature_dict[fname][0]
                transformer = self.feature_dict[fname][1]
                x = transformer.fit_transform(x)
                dim = np_utils._dim(x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    if not mandatory and abs(corr) < self.corr_threshold:
                        self.logger.info(
                            "Drop: {} ({}D) (abs corr = {}, < threshold = {})".
                            format(fname, dim, abs(corr), self.corr_threshold))
                        continue
                    dfAll_cv_all[fname] = x
                    self.feature_names.append(fname)
                else:
                    columns = ["%s_%d" % (fname, x) for x in range(dim)]
                    df = pd.DataFrame(x, columns=columns)
                    dfAll_cv_all = pd.concat([dfAll_cv_all, df], axis=1)
                    self.feature_names.extend(columns)
                feat_cv_cnt += 1
                self.feature_names_cv.append(fname)
                if dim == 1:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D) (corr = {})".
                        format(feat_cnt + feat_cv_cnt,
                               len(self.feature_dict.keys()), fname, dim,
                               corr))
                else:
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
        if feat_cv_cnt > 0:
            dfAll_cv_all.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
            X_tmp = dfAll_cv_all.drop(["id", "relevance"],
                                      axis=1).values.astype(float)
            self.X_train_cv_all = X_tmp[:TRAIN_SIZE]
            self.X_test = np.hstack((self.X_test, X_tmp[TRAIN_SIZE:]))
        else:
            self.X_train_cv_all = None
        feat_cnt += feat_cv_cnt

        ## for cv features
        first = True
        for run in range(1, self.n_iter + 1):
            feat_cv_cnt = 0
            dfAll_cv = dfAll_raw.copy()
            feature_dir = "%s/Run%d" % (config.FEAT_DIR, run)
            for file_name in sorted(os.listdir(feature_dir)):
                if self.feature_suffix in file_name:
                    fname = file_name.split(".")[0]
                    if (fname not in self.feature_dict) or (
                            fname not in self.feature_names_cv):
                        continue
                    if first:
                        self.logger.info("Run for cv...")
                        first = False
                    if feat_cv_cnt == 0:
                        self.logger.info("Run %d" % run)
                    x = self.load_feature(feature_dir, fname)
                    x = np.nan_to_num(x)
                    if np.isnan(x).any():
                        self.logger.info("%s nan" % fname)
                        continue
                    # apply feature transform
                    mandatory = self.feature_dict[fname][0]
                    transformer = self.feature_dict[fname][1]
                    x = transformer.fit_transform(x)
                    dim = np_utils._dim(x)
                    if dim == 1:
                        dfAll_cv[fname] = x
                    else:
                        columns = ["%s_%d" % (fname, x) for x in range(dim)]
                        df = pd.DataFrame(x, columns=columns)
                        dfAll_cv = pd.concat([dfAll_cv, df], axis=1)
                    feat_cv_cnt += 1
                    self.logger.info(
                        "Combine {:>3}/{:>3} feat: {} ({}D)".format(
                            feat_cnt + feat_cv_cnt,
                            len(self.feature_dict.keys()), fname, dim))
            if feat_cv_cnt > 0:
                dfAll_cv.fillna(config.MISSING_VALUE_NUMERIC, inplace=True)
                dfTrain_cv = dfAll_cv.iloc[:TRAIN_SIZE].copy()
                X_tmp = dfTrain_cv.drop(["id", "relevance"],
                                        axis=1).values.astype(float)
                if run == 1:
                    self.X_train_cv = np.zeros(
                        (X_tmp.shape[0], X_tmp.shape[1], self.n_iter),
                        dtype=float)
                self.X_train_cv[:, :, run - 1] = X_tmp
        if feat_cv_cnt == 0:
            self.X_train_cv = None
            self.basic_only = 1

        # report final results
        if self.basic_only:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1]))
        else:
            self.logger.info("Overall Shape: %d x %d" %
                             (len(self.y_train), self.X_train.shape[1] +
                              self.X_train_cv_all.shape[1]))
        self.logger.info("Done combinning.")

        return self
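
# combine() drops non-mandatory 1-D features whose absolute correlation with y_train
# falls below corr_threshold. np_utils._corr is defined elsewhere; a sketch, assuming
# it is plain Pearson correlation with a guard against constant columns:
import numpy as np

def _corr_sketch(x, y_train):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y_train, dtype=float)
    if x.std() == 0. or y.std() == 0.:
        return 0.
    return float(np.corrcoef(x, y)[0, 1])
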
Exemple #52
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log" % (which,
                                                       time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "homedepot":
        ## word2vec model trained with Homedepot dataset: brand/color/query/title/description
        word2vec_model_dirs.append(
            config.WORD2VEC_MODEL_DIR +
            "/Homedepot-word2vec-D%d-min_count%d.model" %
            (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
        model_prefixes.append("Homedepot")
    elif which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR +
                                   "/glove.6B.300d.txt")
        model_prefixes.append("Wikipedia")
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR +
                                   "/GoogleNews-vectors-negative300.bin")
        model_prefixes.append("GoogleNews")
    elif which == "common_crawl":
        ## word2vec model pretrained with Common Crawl
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR +
                                   "/glove.840B.300d.txt")
        model_prefixes.append("CommonCrawl")

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs,
                                                model_prefixes):
        ## load model
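        # NOTE: gensim >= 1.0 moved these loaders to
        # gensim.models.KeyedVectors.load_word2vec_format; this snippet targets the
        # pre-1.0 Word2Vec API.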
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
                    word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
                    word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(
                    word2vec_model_dir)
        except:
            continue

        # ## standalone (not used in model building)
        # obs_fields = ["search_term", "product_title", "product_description"]
        # generator = Word2Vec_Centroid_Vector
        # param_list = [word2vec_model, model_prefix]
        # sf = StandaloneFeatureWrapper(generator, dfAll, obs_fields, param_list, config.FEAT_DIR, logger)
        # sf.go()

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity,
            Word2Vec_N_Similarity_Imp,
            Word2Vec_Centroid_RMSE,
            Word2Vec_Centroid_RMSE_IMP,
            # # not used in final submission
            # Word2Vec_Centroid_Vdiff,
        ]
        obs_fields_list = []
        target_fields_list = []
        obs_fields_list.append(["question1"])
        target_fields_list.append(["question2"])
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    word2vec_model, model_prefix, aggregation_mode,
                    aggregation_mode_prev
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
    def load_feature(self, feature_dir, feature_name):
        fname = os.path.join(feature_dir, feature_name + self.feature_suffix)
        return pkl_utils._load(fname)
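
# Word2Vec_CosineSim above is generated with "double aggregation": for each text pair
# the token-vs-token cosine similarities form a matrix, whose rows are first reduced
# with aggregation_mode_prev and the result reduced again with aggregation_mode. A toy
# sketch of the idea (the actual generator lives elsewhere and may differ in detail):
import numpy as np

def double_aggregate_sketch(sim_matrix, agg_prev="max", agg="mean"):
    first = getattr(np, agg_prev)(np.asarray(sim_matrix, dtype=float), axis=1)
    return float(getattr(np, agg)(first))

# double_aggregate_sketch([[0.9, 0.1], [0.2, 0.8]], "max", "mean") -> 0.85
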
Exemple #54
    def __init__(self, model_name, data_name, cv_runs, params_dict, logger):
        print("Loading data...")
        if data_name == "wiki":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKI_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.WIKI_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.WIKI_TYPE)
            num_types = len(type2id)
            type_info = config.WIKI_TYPE
        elif data_name == "ontonotes":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.ONTONOTES_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE)
            num_types = len(type2id)
            type_info = config.ONTONOTES_TYPE # "./data/corpus/OntoNotes/type.pkl"
        elif data_name == "wikim":
            words_train, mentions_train, positions_train, labels_train = data_utils.load(config.WIKIM_TRAIN_CLEAN)
            words, mentions, positions, labels = data_utils.load(config.WIKIM_TEST_CLEAN)
            type2id, typeDict = pkl_utils._load(config.WIKIM_TYPE)
            num_types = len(type2id)
            type_info = config.WIKIM_TYPE

        self.id2type = {type2id[x]:x for x in type2id.keys()}
        def type2vec(types):
            tmp = np.zeros(num_types)
            for t in types.split():
                tmp[type2id[t]] = 1.0
            return tmp
        labels_train = np.array([type2vec(t) for t in labels_train]) # one_hot coding
        labels = np.array([type2vec(t) for t in labels]) # labels_test [test_size,num_types]

        self.embedding = embedding_utils.Embedding.fromCorpus(config.EMBEDDING_DATA, list(words_train)+list(words), config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE)
        # MAX_DOCUMENT_LENGTH = 30
        # MENTION_SIZE = 15
        # WINDOW_SIZE = 10

        print("Preprocessing data...")
        textlen_train = np.array([self.embedding.len_transform1(x) for x in words_train]) #1-D array [total] constraints max sentences len to 30
        words_train = np.array([self.embedding.text_transform1(x) for x in words_train]) # 2-D array [[index,],] [total,30]
        mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train]) # [total],constrains max mentions len to 15
        mentions_train = np.array([self.embedding.text_transform2(x) for x in mentions_train]) # [total,15]
        positions_train = np.array([self.embedding.position_transform(x) for x in positions_train]) # [total,30]

        textlen = np.array([self.embedding.len_transform1(x) for x in words])
        words = np.array([self.embedding.text_transform1(x) for x in words])
        mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions])
        mentions = np.array([self.embedding.text_transform2(x) for x in mentions])
        positions = np.array([self.embedding.position_transform(x) for x in positions])

        ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED)
        for test_index, valid_index in ss.split(np.zeros(len(labels)), labels):
            textlen_test, textlen_valid = textlen[test_index], textlen[valid_index]
            words_test, words_valid = words[test_index], words[valid_index]
            mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index]
            mentions_test, mentions_valid = mentions[test_index], mentions[valid_index]
            positions_test, positions_valid = positions[test_index], positions[valid_index]
            labels_test, labels_valid = labels[test_index], labels[valid_index]
        # shapes: [?,30] [?] [?,15] [?] [?,30] [?,num_types]
        # --> ? tuples of (sentence, len, mention, len, positions, type)
        self.train_set = list(zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train))
        self.valid_set = list(zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid))
        self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test))
        self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels))

        self.labels_test = labels_test
        self.labels = labels

        self.model_name = model_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        self.logger = logger

        self.num_types = num_types
        self.type_info = type_info

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__())
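
# Example of type2vec above: with type2id = {"/person": 0, "/person/artist": 1,
# "/location": 2}, type2vec("/person /person/artist") returns array([1., 1., 0.]).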
Exemple #55
def main(which):
    logname = "generate_feature_word2vec_%s_%s.log" % (which,
                                                       time_utils._timestamp())
    logger = logging_utils._get_logger(config.LOG_DIR, logname)
    #### NOTE: use data BEFORE STEMMING
    dfAll = pkl_utils._load(config.ALL_DATA_LEMMATIZED)

    word2vec_model_dirs = []
    model_prefixes = []
    if which == "wikipedia":
        ## word2vec model pretrained with Wikipedia+Gigaword 5
        word2vec_model_dirs.append(config.GLOVE_WORD2VEC_MODEL_DIR +
                                   "/glove.6B.300d.txt")
        model_prefixes.append("Wikipedia")
    elif which == "google":
        ## word2vec model pretrained with Google News
        word2vec_model_dirs.append(config.WORD2VEC_MODEL_DIR +
                                   "/GoogleNews-vectors-negative300.bin")
        model_prefixes.append("GoogleNews")
    elif which == "quora":
        ## word2vec model trained with Quora dataset: question1/question2
        word2vec_model_dirs.append(
            config.WORD2VEC_MODEL_DIR +
            "/Quora-word2vec-D%d-min_count%d.model" %
            (config.EMBEDDING_DIM, config.EMBEDDING_MIN_COUNT))
        model_prefixes.append("Quora")
    print("word2vec mode: {}".format(which))

    for word2vec_model_dir, model_prefix in zip(word2vec_model_dirs,
                                                model_prefixes):
        ## load model
        try:
            if ".bin" in word2vec_model_dir:
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
                    word2vec_model_dir, binary=True)
            elif ".txt" in word2vec_model_dir:
                #ipdb.set_trace()
                word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
                    word2vec_model_dir, binary=False)
            else:
                word2vec_model = gensim.models.Word2Vec.load(
                    word2vec_model_dir)
        except:
            continue

        ## pairwise
        generators = [
            Word2Vec_Importance,
            Word2Vec_N_Similarity,
            Word2Vec_N_Similarity_Imp,
            Word2Vec_Centroid_RMSE,
            Word2Vec_Centroid_RMSE_IMP,
        ]
        obs_fields_list = [["question1"], ["question2"]]
        target_fields_list = [["question2"], ["question1"]]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [word2vec_model, model_prefix]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()

        ## cosine sim
        generators = [
            Word2Vec_CosineSim,
        ]
        # double aggregation
        aggregation_mode_prev = ["mean", "max", "min", "median"]
        aggregation_mode = ["mean", "std", "max", "min", "median"]
        for obs_fields, target_fields in zip(obs_fields_list,
                                             target_fields_list):
            for generator in generators:
                param_list = [
                    word2vec_model, model_prefix, aggregation_mode,
                    aggregation_mode_prev
                ]
                pf = PairwiseFeatureWrapper(generator, dfAll, obs_fields,
                                            target_fields, param_list,
                                            config.FEAT_DIR, logger)
                pf.go()
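
# A minimal, hypothetical command-line wrapper for main(which) above; the project may
# wire this up differently (e.g. via its own option parser):
if __name__ == "__main__":
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else "quora")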