def loadData(fpath="../data/",k=5,useDesc=True,labelFields=False,removeHTML=True,removePunc=False,removeDigits=False,stemming=None,SEED=55,load2vec=True):
	import pandas as pd
	from os import listdir
	from sklearn.model_selection import KFold
	from helperFunctions import cleanText
	train = pd.read_csv(fpath+"train.csv").fillna('')
	cv = KFold(n_splits=k, shuffle=True, random_state=SEED)  # iterate folds with cv.split(train)
	train = train.rename(columns={'product_title': 'title', 'product_description': 'description'})
	test = pd.read_csv(fpath + "test.csv").fillna('')
	test = test.rename(columns={'product_title': 'title', 'product_description': 'description'})
	# add simple count/length features
	train['query_words'] = [len(x.split(' ')) for x in train['query']]
	test['query_words'] = [len(x.split(' ')) for x in test['query']]
	train['title_len'] = [len(x) for x in train['title']]
	test['title_len'] = [len(x) for x in test['title']]
	train['desc_len'] = [len(x) for x in train['description']]
	test['desc_len'] = [len(x) for x in test['description']]
	# combine text fields (optionally including the product description)
	if useDesc:
		fields = ['query', 'title', 'description']
	else:
		fields = ['query', 'title']
	for v in fields:
		train[v] = cleanText(train[v], removeHTML=removeHTML, removePunc=removePunc, removeDigits=removeDigits, stemming=stemming)
		test[v] = cleanText(test[v], removeHTML=removeHTML, removePunc=removePunc, removeDigits=removeDigits, stemming=stemming)
	if labelFields:
		# prefix each token with the first letter of its source field
		# ('q', 't', 'd') so field origin survives the concatenation
		train['text'] = train.apply(lambda x: ' '.join(' '.join(v[0] + y for y in x[v].split(' ')) for v in fields), 1)
		test['text'] = test.apply(lambda x: ' '.join(' '.join(v[0] + y for y in x[v].split(' ')) for v in fields), 1)
	else:
		train['text'] = train.apply(lambda x: ' '.join(x[v] for v in fields), 1)
		test['text'] = test.apply(lambda x: ' '.join(x[v] for v in fields), 1)
	# fraction of query tokens that appear in the (lowercased) title
	train['query_percent_title'] = train[['query', 'title']].apply(
		lambda row: 1.0 * sum(q in row['title'].lower().split(" ") for q in row['query'].split(" ")) / len(row['query'].split(" ")), 1)
	test['query_percent_title'] = test[['query', 'title']].apply(
		lambda row: 1.0 * sum(q in row['title'].lower().split(" ") for q in row['query'].split(" ")) / len(row['query'].split(" ")), 1)
	train['query_word_score'] = train.query_words*train.query_percent_title
	test['query_word_score'] = test.query_words*test.query_percent_title
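	# note: query_words * query_percent_title reduces to the raw count of
	# query words found in the title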
	if load2vec:
		# average any previously saved doc2vec/word2vec score files from fpath
		train_doc2vec, test_doc2vec = {}, {}
		for f in [x for x in listdir(fpath) if x.startswith("train_doc2vec")]:
			train_doc2vec[f] = pd.read_csv(fpath + f)['doc2vec'].values
			test_doc2vec[f] = pd.read_csv(fpath + f.replace('train', 'test'))['doc2vec'].values
		train['doc2vec'] = applyWeights(train_doc2vec, {name: 1.0 / len(train_doc2vec) for name in train_doc2vec})
		test['doc2vec'] = applyWeights(test_doc2vec, {name: 1.0 / len(test_doc2vec) for name in test_doc2vec})
		train_word2vec, test_word2vec = {}, {}
		for f in [x for x in listdir(fpath) if x.startswith("train_word2vec")]:
			train_word2vec[f] = pd.read_csv(fpath + f)['word2vec'].values
			test_word2vec[f] = pd.read_csv(fpath + f.replace('train', 'test'))['word2vec'].values
		train['word2vec'] = applyWeights(train_word2vec, {name: 1.0 / len(train_word2vec) for name in train_word2vec})
		test['word2vec'] = applyWeights(test_word2vec, {name: 1.0 / len(test_word2vec) for name in test_word2vec})
	return train.median_relevance, train, test, cv
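
# applyWeights is referenced by loadData but never defined or imported in this
# snippet. A minimal sketch, assuming the helper blends the per-model score
# arrays by a weighted sum (an equal-weight average given weights of 1/n):
def applyWeights(vectors, weights):
	# each value in `vectors` is a numpy array of per-row scores, so the
	# weighted sum yields one blended array of the same length
	return sum(weights[name] * vectors[name] for name in vectors)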
import argparse

import pandas as pd

# cleanText is the same helper loadData uses; trainWord2Vec's home module is
# not shown in this snippet, so importing both from helperFunctions is an assumption
from helperFunctions import cleanText, trainWord2Vec

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--name", dest="fname", type=str)
parser.add_argument("--external", dest="external_path", type=str, default="../data/external.txt")
parser.add_argument("--path", dest="path", type=str, default="../data/")
parser.add_argument("--stem", dest="stemming", type=str, default="None")
parser.add_argument("--html", dest="removeHTML", action='store_true')
parser.add_argument("--punc", dest="removePunc", action='store_true')
parser.add_argument("--digits", dest="removeDigits", action='store_true')
parser.add_argument("--seed", dest="SEED", type=int, default=22)

args = parser.parse_args()

############
### Main ###
############
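
# Pipeline: load and clean the competition data, clean an external text corpus,
# train word2vec-based similarity scores from it, and save per-row scores to
# CSV so loadData(load2vec=True) can average them back in later.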

# --stem defaults to the string "None"; normalize it so cleanText sees a real None
stemming = None if args.stemming == "None" else args.stemming
target, train, test, cv = loadData(k=2, useDesc=False, labelFields=False, removeHTML=args.removeHTML, removePunc=args.removePunc, removeDigits=args.removeDigits, stemming=stemming, SEED=args.SEED, load2vec=False)
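# cv is a model_selection KFold; folds come from cv.split(train), e.g.:
#   for tr_idx, va_idx in cv.split(train): ...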
train['query'] = [x.lower() for x in train['query']]


with open(args.external_path, 'r') as f:  # text mode, so the lines are str not bytes
	data = f.readlines()
data = cleanText(data, removeHTML=args.removeHTML, removePunc=args.removePunc, removeDigits=args.removeDigits, stemming=stemming)
data = [x.lower() for x in data]
with open('../data/tmp.txt', 'w') as f:
	f.write('\n'.join(data))
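
# trainWord2Vec is not shown here; from its use below it is assumed to train on
# the corpus file and return a (train_scores, test_scores) pair, with one
# similarity score per row of train and test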

word2vec = trainWord2Vec('../data/tmp.txt',train,test)

pd.DataFrame({'id': train.id, 'word2vec': word2vec[0]}).to_csv("../data/train_word2vec_" + args.fname + ".csv", index=False)
pd.DataFrame({'id': test.id, 'word2vec': word2vec[1]}).to_csv("../data/test_word2vec_" + args.fname + ".csv", index=False)
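
# Example invocation (script name hypothetical):
#   python trainWord2Vec.py --name ext1 --external ../data/external.txt --html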
parser.add_argument("--stem", dest="stemming", type=str,default="None")
parser.add_argument("--html", dest="removeHTML", action='store_true')
parser.add_argument("--punc", dest="removePunc", action='store_true')
parser.add_argument("--digits", dest="removeDigits", action='store_true')
parser.add_argument("--seed", dest="SEED", type=int,default=22)

args = parser.parse_args()

############
### Main ###
############
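
# Pipeline: load and clean the competition data, assemble an external
# query/title corpus (the JSON file plus the perfectly relevant training
# pairs), train doc2vec-based similarity scores, and save them to CSV for
# loadData(load2vec=True).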

stemming = None if args.stemming == "None" else args.stemming  # normalize the "None" string default
target, train, test, cv = loadData(k=2, useDesc=False, labelFields=False, removeHTML=args.removeHTML, removePunc=args.removePunc, removeDigits=args.removeDigits, stemming=stemming, SEED=args.SEED, load2vec=False)

with open(args.external_path, 'r') as f:
	external = pd.DataFrame(json.load(f))
del external['rank']
external['query'] = cleanText(external['query'], removeHTML=args.removeHTML, removePunc=args.removePunc, removeDigits=args.removeDigits, stemming=stemming)
external['title'] = cleanText(external['title'], removeHTML=args.removeHTML, removePunc=args.removePunc, removeDigits=args.removeDigits, stemming=stemming)
for v in ['query', 'title']:
	external[v] = [x.lower() for x in external[v]]
	train[v] = [x.lower() for x in train[v]]
	test[v] = [x.lower() for x in test[v]]

# rows judged perfectly relevant (median_relevance == 4) supply extra
# ground-truth query/title pairs; DataFrame.append was removed in pandas 2.0,
# so concat is used instead
external = pd.concat([external, train[target == 4][['query', 'title']]])
external = external.reset_index(drop=True)  # drop=True keeps the old index out of the columns
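
# trainDoc2Vec is not shown here; from its use below it is assumed to return a
# (train_scores, test_scores) pair aligned with the train and test rows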

doc2vec = trainDoc2Vec(external['query'], external['title'], train, test)

pd.DataFrame({'id': train.id, 'doc2vec': doc2vec[0]}).to_csv("../data/train_doc2vec_" + args.fname + ".csv", index=False)
pd.DataFrame({'id': test.id, 'doc2vec': doc2vec[1]}).to_csv("../data/test_doc2vec_" + args.fname + ".csv", index=False)
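
# Example invocation (script name hypothetical; this script expects a JSON corpus):
#   python trainDoc2Vec.py --name ext1 --external ../data/external.json --html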