def loadData(fpath="../data/", k=5, useDesc=True, labelFields=False,
             removeHTML=True, removePunc=False, removeDigits=False,
             stemming=None, SEED=55, load2vec=True):
    """Load the competition train/test CSVs and build model-ready columns.

    Parameters
    ----------
    fpath : str           directory containing train.csv / test.csv
    k : int               number of cross-validation folds
    useDesc : bool        include the product description in the text fields
    labelFields : bool    prefix each token with its field's first letter
    removeHTML, removePunc, removeDigits, stemming : passed to cleanText
    SEED : int            random_state for the KFold shuffle
    load2vec : bool       merge precomputed doc2vec/word2vec feature files

    Returns
    -------
    (train.median_relevance, train DataFrame, test DataFrame, KFold iterator)
    """
    import pandas as pd
    from os import listdir
    # NOTE(review): sklearn.cross_validation was removed in sklearn 0.20;
    # kept here because callers depend on this KFold object's old API.
    from sklearn.cross_validation import KFold
    from helperFunctions import cleanText

    train = pd.read_csv(fpath + "train.csv").fillna('')
    cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
    # Rename the long product_* columns.
    train['title'] = train.product_title
    del train['product_title']
    train['description'] = train.product_description
    del train['product_description']
    test = pd.read_csv(fpath + "test.csv").fillna('')
    test['title'] = test.product_title
    del test['product_title']
    test['description'] = test.product_description
    del test['product_description']

    # Simple size features.
    train['query_words'] = [len(x.split(' ')) for x in train['query']]
    test['query_words'] = [len(x.split(' ')) for x in test['query']]
    train['title_len'] = [len(x) for x in train['title']]
    test['title_len'] = [len(x) for x in test['title']]
    train['desc_len'] = [len(x) for x in train['description']]
    test['desc_len'] = [len(x) for x in test['description']]

    # Clean and combine the text fields.
    # Fix: renamed from 'vars', which shadowed the builtin.
    text_fields = (['query', 'title', 'description'] if useDesc
                   else ['query', 'title'])
    for v in text_fields:
        train[v] = cleanText(train[v], removeHTML=removeHTML,
                             removePunc=removePunc,
                             removeDigits=removeDigits, stemming=stemming)
        test[v] = cleanText(test[v], removeHTML=removeHTML,
                            removePunc=removePunc,
                            removeDigits=removeDigits, stemming=stemming)
    if labelFields:
        # Tag every token with the first letter of its source field
        # ('q', 't', 'd') so downstream models can tell fields apart.
        train['text'] = train.apply(
            lambda x: ' '.join(
                ' '.join(v[0] + y for y in x[v].split(' '))
                for v in text_fields), 1)
        test['text'] = test.apply(
            lambda x: ' '.join(
                ' '.join(v[0] + y for y in x[v].split(' '))
                for v in text_fields), 1)
    else:
        train['text'] = train.apply(
            lambda x: ' '.join(x[v] for v in text_fields), 1)
        test['text'] = test.apply(
            lambda x: ' '.join(x[v] for v in text_fields), 1)

    # Fraction of query tokens that appear verbatim in the lower-cased title.
    # NOTE(review): the title is lower-cased but the query is not, so matching
    # is case-sensitive on the query side — callers lowercase 'query'
    # separately; confirm before changing.
    train['query_percent_title'] = train[['query', 'title']].apply(
        lambda row: 1.0 * sum(
            q in row.title.lower().split(" ")
            for q in row['query'].split(" ")
        ) / len(row['query'].split(" ")), 1)
    test['query_percent_title'] = test[['query', 'title']].apply(
        lambda row: 1.0 * sum(
            q in row.title.lower().split(" ")
            for q in row['query'].split(" ")
        ) / len(row['query'].split(" ")), 1)
    train['query_word_score'] = train.query_words * train.query_percent_title
    test['query_word_score'] = test.query_words * test.query_percent_title

    if load2vec:
        # Merge every precomputed doc2vec feature file with equal weights.
        train_doc2vec = {}
        test_doc2vec = {}
        for f in [x for x in listdir("../data/")
                  if x.startswith("train_doc2vec")]:
            train_doc2vec[f] = pd.read_csv("../data/" + f)['doc2vec'].values
            test_doc2vec[f] = pd.read_csv(
                "../data/" + f.replace('train', 'test'))['doc2vec'].values
        # Fix: loop variable renamed from 'k', which shadowed (and under
        # Python 2 list-comp scoping rebound) the fold-count parameter.
        train['doc2vec'] = applyWeights(
            train_doc2vec,
            {name: 1.0 / len(train_doc2vec) for name in train_doc2vec})
        test['doc2vec'] = applyWeights(
            test_doc2vec,
            {name: 1.0 / len(test_doc2vec) for name in test_doc2vec})

        # Same merge for the word2vec feature files.
        train_word2vec = {}
        test_word2vec = {}
        for f in [x for x in listdir("../data/")
                  if x.startswith("train_word2vec")]:
            train_word2vec[f] = pd.read_csv("../data/" + f)['word2vec'].values
            test_word2vec[f] = pd.read_csv(
                "../data/" + f.replace('train', 'test'))['word2vec'].values
        train['word2vec'] = applyWeights(
            train_word2vec,
            {name: 1.0 / len(train_word2vec) for name in train_word2vec})
        test['word2vec'] = applyWeights(
            test_word2vec,
            {name: 1.0 / len(test_word2vec) for name in test_word2vec})

    return train.median_relevance, train, test, cv
# Command-line options for the word2vec feature-building script.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--name", dest="fname", type=str)
parser.add_argument("--external", dest="external_path", type=str,
                    default="../data/external.txt")
parser.add_argument("--path", dest="path", type=str, default="../data/")
parser.add_argument("--stem", dest="stemming", type=str, default="None")
parser.add_argument("--html", dest="removeHTML", action='store_true')
parser.add_argument("--punc", dest="removePunc", action='store_true')
parser.add_argument("--digits", dest="removeDigits", action='store_true')
parser.add_argument("--seed", dest="SEED", type=int, default=22)
args = parser.parse_args()

############
### Main ###
############
target, train, test, cv = loadData(k=2, useDesc=False, labelFields=False,
                                   removeHTML=args.removeHTML,
                                   removePunc=args.removePunc,
                                   removeDigits=args.removeDigits,
                                   stemming=args.stemming, SEED=args.SEED,
                                   load2vec=False)
train['query'] = [x.lower() for x in train['query']]

# Clean the external corpus and spool it to a temp file for word2vec training.
# Fix: files were opened without ever being closed — use context managers.
with open(args.external_path, 'rb') as fin:
    data = fin.readlines()
data = cleanText(data, removeHTML=args.removeHTML, removePunc=args.removePunc,
                 removeDigits=args.removeDigits, stemming=args.stemming)
data = [x.lower() for x in data]
with open('../data/tmp.txt', 'wb') as fout:
    fout.write('\n'.join(data))

# Train word2vec and persist per-row similarity features for train and test.
word2vec = trainWord2Vec('../data/tmp.txt', train, test)
pd.DataFrame({'id': train.id, 'word2vec': word2vec[0]}).to_csv(
    "../data/train_word2vec_" + args.fname + ".csv", index=False)
pd.DataFrame({'id': test.id, 'word2vec': word2vec[1]}).to_csv(
    "../data/test_word2vec_" + args.fname + ".csv", index=False)
def loadData(fpath="../data/", k=5, useDesc=True, labelFields=False,
             removeHTML=True, removePunc=False, removeDigits=False,
             stemming=None, SEED=55, load2vec=True):
    """Read train/test CSVs, clean their text, and derive feature columns.

    Returns a 4-tuple: the training target (median_relevance), the train
    frame, the test frame, and a shuffled KFold iterator over the train rows.
    """
    import pandas as pd
    from os import listdir
    from sklearn.cross_validation import KFold
    from helperFunctions import cleanText

    train = pd.read_csv(fpath + "train.csv").fillna('')
    cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
    test = pd.read_csv(fpath + "test.csv").fillna('')

    fields = ['query', 'title', 'description'] if useDesc else ['query', 'title']

    for frame in (train, test):
        # Shorten the product_* column names.
        frame['title'] = frame.product_title
        del frame['product_title']
        frame['description'] = frame.product_description
        del frame['product_description']

        # Size features.
        frame['query_words'] = [len(q.split(' ')) for q in frame['query']]
        frame['title_len'] = [len(t) for t in frame['title']]
        frame['desc_len'] = [len(d) for d in frame['description']]

        # Clean each text field, then combine them into one 'text' column.
        for col in fields:
            frame[col] = cleanText(frame[col], removeHTML=removeHTML,
                                   removePunc=removePunc,
                                   removeDigits=removeDigits,
                                   stemming=stemming)
        if labelFields:
            # Mark each token with the first letter of its source field.
            frame['text'] = frame.apply(
                lambda row: ' '.join(
                    ' '.join(col[0] + tok for tok in row[col].split(' '))
                    for col in fields), 1)
        else:
            frame['text'] = frame.apply(
                lambda row: ' '.join(row[col] for col in fields), 1)

        # Share of query tokens found verbatim in the lower-cased title.
        frame['query_percent_title'] = frame[['query', 'title']].apply(
            lambda row: 1.0 * sum(
                tok in row.title.lower().split(" ")
                for tok in row['query'].split(" ")
            ) / len(row['query'].split(" ")), 1)
        frame['query_word_score'] = (frame.query_words *
                                     frame.query_percent_title)

    if load2vec:
        # Merge every precomputed doc2vec, then word2vec, feature file with
        # equal weights per file.
        for prefix, column in (("train_doc2vec", 'doc2vec'),
                               ("train_word2vec", 'word2vec')):
            train_vecs = {}
            test_vecs = {}
            for fname in listdir("../data/"):
                if not fname.startswith(prefix):
                    continue
                train_vecs[fname] = pd.read_csv(
                    "../data/" + fname)[column].values
                test_vecs[fname] = pd.read_csv(
                    "../data/" + fname.replace('train', 'test'))[column].values
            train[column] = applyWeights(
                train_vecs,
                dict((name, 1.0 / len(train_vecs)) for name in train_vecs))
            test[column] = applyWeights(
                test_vecs,
                dict((name, 1.0 / len(test_vecs)) for name in test_vecs))

    return train.median_relevance, train, test, cv
############
### Main ###
############
target, train, test, cv = loadData(k=2, useDesc=False, labelFields=False,
                                   removeHTML=args.removeHTML,
                                   removePunc=args.removePunc,
                                   removeDigits=args.removeDigits,
                                   stemming=args.stemming, SEED=args.SEED,
                                   load2vec=False)
train['query'] = [x.lower() for x in train['query']]

# Clean the external corpus and spool it to a temp file for word2vec training.
# Fix: files were opened without ever being closed — use context managers.
with open(args.external_path, 'rb') as fin:
    data = fin.readlines()
data = cleanText(data, removeHTML=args.removeHTML, removePunc=args.removePunc,
                 removeDigits=args.removeDigits, stemming=args.stemming)
data = [x.lower() for x in data]
with open('../data/tmp.txt', 'wb') as fout:
    fout.write('\n'.join(data))

# Train word2vec and persist per-row similarity features for train and test.
word2vec = trainWord2Vec('../data/tmp.txt', train, test)
pd.DataFrame({
    'id': train.id,
    'word2vec': word2vec[0]
}).to_csv("../data/train_word2vec_" + args.fname + ".csv", index=False)
pd.DataFrame({
    'id': test.id,
    'word2vec': word2vec[1]
}).to_csv("../data/test_word2vec_" + args.fname + ".csv", index=False)
# Remaining command-line options for the doc2vec feature-building script.
parser.add_argument("--stem", dest="stemming", type=str, default="None")
parser.add_argument("--html", dest="removeHTML", action='store_true')
parser.add_argument("--punc", dest="removePunc", action='store_true')
parser.add_argument("--digits", dest="removeDigits", action='store_true')
parser.add_argument("--seed", dest="SEED", type=int, default=22)
args = parser.parse_args()

############
### Main ###
############
target, train, test, cv = loadData(k=2, useDesc=False, labelFields=False,
                                   removeHTML=args.removeHTML,
                                   removePunc=args.removePunc,
                                   removeDigits=args.removeDigits,
                                   stemming=args.stemming, SEED=args.SEED,
                                   load2vec=False)

# Load the external (query, title) pairs and clean them the same way as the
# competition data. Fix: json.load(open(...)) leaked the file handle.
with open(args.external_path, 'rb') as fin:
    external = pd.DataFrame(json.load(fin))
del external['rank']
external['query'] = cleanText(external['query'], removeHTML=args.removeHTML,
                              removePunc=args.removePunc,
                              removeDigits=args.removeDigits,
                              stemming=args.stemming)
external['title'] = cleanText(external['title'], removeHTML=args.removeHTML,
                              removePunc=args.removePunc,
                              removeDigits=args.removeDigits,
                              stemming=args.stemming)
for v in ['query', 'title']:
    external[v] = [x.lower() for x in external[v]]
    train[v] = [x.lower() for x in train[v]]
    test[v] = [x.lower() for x in test[v]]

# Augment the corpus with perfectly-relevant training pairs
# (median_relevance == 4).
external = external.append(train[target == 4][['query', 'title']])
external = external.reset_index()

# Train doc2vec and persist per-row similarity features for train and test.
doc2vec = trainDoc2Vec(external['query'], external['title'], train, test)
pd.DataFrame({'id': train.id, 'doc2vec': doc2vec[0]}).to_csv(
    "../data/train_doc2vec_" + args.fname + ".csv", index=False)
pd.DataFrame({'id': test.id, 'doc2vec': doc2vec[1]}).to_csv(
    "../data/test_doc2vec_" + args.fname + ".csv", index=False)