from __future__ import print_function import numpy as np import pandas as pd from sklearn.ensemble import BaggingRegressor from myml.nn import NnRegression from myml.files import load from utils import cv_generate X2, X2_test, y = load("data/XXtestY250_r2td") X_extra = pd.read_csv('data/ngramMatch_07.csv').values X_extra_test = pd.read_csv('data/ngramMatch_test_07.csv').values X_1234 = load('data/train1234_c1_r') X_test_1234 = load('data/test1234_c1_r') X_1234_2 = load('data/train1234_2_c1_r') X_test_1234_2 = load('data/test1234_2_c1_r') X_1234_3 = load('data/train1234_3_c1_r') X_test_1234_3 = load('data/test1234_3_c1_r') X_anti_1234 = load('data/train_anti_1234_c1_r') X_test_anti_1234 = load('data/test_anti_1234_c1_r') X_union_f, X_test_union_f = load('data/XXunion_f_norm') train_test_alt = pd.read_csv('data/alt_query_features_train_and_test_v01.csv').values train_alt = train_test_alt[:10158] test_alt = train_test_alt[10158:] train_ngram = load('data/train1234_ngram_r')
# Stem every stop word in place, then freeze the list into a set for O(1)
# membership tests.  `stop_words` and `stemmer` are defined earlier in the
# original file (outside this view).
for i in range(len(stop_words)):
    stop_words[i] = stemmer.stem(stop_words[i])
stop_words = set(stop_words)


def stem_one(w):
    """Return the stem of a single word using the module-level stemmer."""
    return stemmer.stem(w)


def process_str(s):
    """Normalize a raw string into a list of stemmed tokens.

    Lowercases, strips HTML markup via BeautifulSoup, removes every
    character outside [a-z0-9], then stems each whitespace token.
    Returns a list of stems (empty tokens dropped).
    """
    s = s.lower()
    #s = re.sub("- [a-z\/]+$", '', s)
    # Extract visible text from any HTML, joining fragments with spaces.
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    # Replace all non-alphanumeric characters with spaces.
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx


# Mapping of known misspelled queries -> corrected query text.
query_auto_correct = load('data/query_auto_correct')


def process_str_replace(s):
    """Like process_str, but auto-corrects known queries and splits digits
    from adjacent text before stemming.

    NOTE(review): this chunk is truncated — the tail of the function
    (the `for w in wx` loop body and the return) lies outside this view.
    """
    s = s.lower()
    # Replace the whole query when it matches a known auto-correction.
    if s in query_auto_correct:
        #print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    #s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    # Split on spaces and on digit-run boundaries so e.g. "2x4" separates
    # the numbers from the text; the capturing group keeps the delimiters.
    sx = re.split(r'( |\b\d+|\d+\b)', s)
    sx = [w.strip() for w in sx]
    wx = [stemmer.stem(z) for z in sx if z]
    rez_wx = []
    for w in wx:
        # NOTE(review): source chunk ends here mid-loop; body not visible.
from __future__ import print_function import numpy as np import pandas as pd from sklearn.ensemble import BaggingRegressor from myml.nn import NnRegression from myml.files import load from utils import cv_generate X2, X2_test, y = load("data/XXtestY250_r2td") X_extra = pd.read_csv('data/ngramMatch_07.csv').values X_extra_test = pd.read_csv('data/ngramMatch_test_07.csv').values X_1234 = load('data/train1234_c1_r') X_test_1234 = load('data/test1234_c1_r') X_1234_2 = load('data/train1234_2_c1_r') X_test_1234_2 = load('data/test1234_2_c1_r') X_1234_3 = load('data/train1234_3_c1_r') X_test_1234_3 = load('data/test1234_3_c1_r') X_anti_1234 = load('data/train_anti_1234_c1_r') X_test_anti_1234 = load('data/test_anti_1234_c1_r') X_union_f, X_test_union_f = load('data/XXunion_f_norm') train_test_alt = pd.read_csv( 'data/alt_query_features_train_and_test_v01.csv').values train_alt = train_test_alt[:10158] test_alt = train_test_alt[10158:]
# Stem the stop-word list in place, then convert it to a set so later
# membership checks are O(1).  `stop_words` and `stemmer` come from earlier
# in the original file (outside this view).
for i in range(len(stop_words)):
    stop_words[i] = stemmer.stem(stop_words[i])
stop_words = set(stop_words)


def stem_one(w):
    """Return the stem of a single word using the module-level stemmer."""
    return stemmer.stem(w)


def process_str(s):
    """Normalize a raw string into a list of stemmed tokens.

    Lowercases, strips HTML via BeautifulSoup, removes all characters
    outside [a-z0-9], then stems each whitespace-separated token.
    """
    s = s.lower()
    #s = re.sub("- [a-z\/]+$", '', s)
    # Extract visible text from any HTML markup, space-joined.
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx


# Lookup tables loaded from pickles:
#   long_word_replace  — presumably maps long words to 2-part replacements; TODO confirm
#   query_auto_correct — known misspelled queries -> corrected text
#   num_to_num         — presumably a numeric-token normalization map; TODO confirm
long_word_replace = load('data/word_to_2_replacer')
query_auto_correct = load('data/query_auto_correct')
num_to_num = load('data/num_to_num')


def process_str_replace(s):
    """Like process_str, but auto-corrects known queries and splits digit
    runs from adjacent text before stemming.

    NOTE(review): this chunk is truncated — the remainder of the function
    after the `sx = [w.strip() ...]` line lies outside this view.
    """
    s = s.lower()
    # Replace the whole query when it matches a known auto-correction.
    if s in query_auto_correct:
        #print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    #s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    # Split on spaces and digit-run boundaries; the capturing group keeps
    # the delimiters in the result list.
    sx = re.split(r'( |\b\d+|\d+\b)', s)
    sx = [w.strip() for w in sx]
    # NOTE(review): source chunk ends here; stemming/return not visible.
def stem_one(w):
    """Return the stem of a single word using the module-level stemmer."""
    return stemmer.stem(w)


def process_str(s):
    """Normalize a raw string into a list of stemmed tokens.

    Lowercases, strips HTML via BeautifulSoup, removes all characters
    outside [a-z0-9], then stems each whitespace-separated token.
    """
    s = s.lower()
    # s = re.sub("- [a-z\/]+$", '', s)
    # Extract visible text from any HTML markup, space-joined.
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx


# Lookup tables loaded from pickles:
#   long_word_replace  — presumably maps long words to 2-part replacements; TODO confirm
#   query_auto_correct — known misspelled queries -> corrected text
#   num_to_num         — presumably a numeric-token normalization map; TODO confirm
long_word_replace = load("data/word_to_2_replacer")
query_auto_correct = load("data/query_auto_correct")
num_to_num = load("data/num_to_num")


def process_str_replace(s):
    """Like process_str, but auto-corrects known queries and splits digit
    runs from adjacent text before stemming.

    NOTE(review): this chunk is truncated — everything after the
    `re.split` call lies outside this view.
    """
    s = s.lower()
    # Replace the whole query when it matches a known auto-correction.
    if s in query_auto_correct:
        # print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    # s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    # s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    # Split on spaces and digit-run boundaries; the capturing group keeps
    # the delimiters in the result list.
    sx = re.split(r"( |\b\d+|\d+\b)", s)
    # NOTE(review): source chunk ends here; the rest of the function is not visible.
# NOTE(review): notebook-export chunk; `pd` and `np` are imported earlier in
# the original file (outside this view).  The `# In[n]:` markers are
# nbconvert cell boundaries.
from nltk.corpus import stopwords
import numpy.linalg as LA
from myml.files import dump, load
from utils import process_str_replace
from myml.utils import clr_print


# In[3]:

# Use the auto-correcting tokenizer everywhere `process_str` is referenced.
process_str = process_str_replace


# In[4]:

# Pickled bundle: train features, test features, relevance target.
X2, X2_test, y = load("data/XXtestY250_r2td")


# In[5]:

# Raw competition data; missing values become empty strings.
train = pd.read_csv("input/train.csv").fillna("")
test = pd.read_csv("input/test.csv").fillna("")
# The id column is not a feature — drop it from both frames.
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
train.head(3)


# In[6]:

# Distinct search queries present in the training set.
qx = set(train['query'].values)