Code Example #1
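# Load precomputed train/test feature blocks and the target vector for the
# crowdflower-search model; each data/* file was produced by earlier
# preprocessing steps (the on-disk format is whatever myml.files.load expects).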
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor

from myml.nn import NnRegression
from myml.files import load

from utils import cv_generate

X2, X2_test, y = load("data/XXtestY250_r2td")

X_extra = pd.read_csv('data/ngramMatch_07.csv').values
X_extra_test = pd.read_csv('data/ngramMatch_test_07.csv').values
X_1234 = load('data/train1234_c1_r')
X_test_1234 = load('data/test1234_c1_r')
X_1234_2 = load('data/train1234_2_c1_r')
X_test_1234_2 = load('data/test1234_2_c1_r')
X_1234_3 = load('data/train1234_3_c1_r')
X_test_1234_3 = load('data/test1234_3_c1_r')
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')

X_union_f, X_test_union_f = load('data/XXunion_f_norm')

train_test_alt = pd.read_csv('data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]

train_ngram = load('data/train1234_ngram_r')
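The snippet ends after the feature blocks are loaded. Below is a minimal sketch of how such blocks could be stacked column-wise and fed to a bagged neural-network regressor; NnRegression and cv_generate come from the project's private myml/utils modules, so the constructor arguments and usage shown here are assumptions, not the author's actual training code.

# Sketch only -- not the project's actual training code.
# Assumes NnRegression follows the scikit-learn estimator interface.
X_all = np.hstack([X2, X_extra, X_1234, X_1234_2, X_1234_3,
                   X_anti_1234, X_union_f, train_alt])
X_all_test = np.hstack([X2_test, X_extra_test, X_test_1234, X_test_1234_2,
                        X_test_1234_3, X_test_anti_1234, X_test_union_f, test_alt])

# Bag several neural-network regressors over subsamples of the stacked features.
model = BaggingRegressor(NnRegression(), n_estimators=10, max_samples=0.9)
model.fit(X_all, y)
pred = model.predict(X_all_test)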
Code Example #2
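# This snippet starts mid-file: stop_words and stemmer are assumed to be
# defined earlier (e.g. NLTK stopwords plus a Porter/Snowball stemmer), and
# re and bs4's BeautifulSoup must already be imported.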
for i in range(len(stop_words)):
    stop_words[i] = stemmer.stem(stop_words[i])
stop_words = set(stop_words)

def stem_one(w):
    return stemmer.stem(w)

def process_str(s):
    s = s.lower()
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx

query_auto_correct = load('data/query_auto_correct')

def process_str_replace(s):
    s = s.lower()
    if s in query_auto_correct:
        #print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    #s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    sx = re.split(r'( |\b\d+|\d+\b)', s)
    sx = [w.strip() for w in sx]
    wx = [stemmer.stem(z) for z in sx if z]
    rez_wx = []
    for w in wx:
Code Example #3
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor

from myml.nn import NnRegression
from myml.files import load

from utils import cv_generate

X2, X2_test, y = load("data/XXtestY250_r2td")

X_extra = pd.read_csv('data/ngramMatch_07.csv').values
X_extra_test = pd.read_csv('data/ngramMatch_test_07.csv').values
X_1234 = load('data/train1234_c1_r')
X_test_1234 = load('data/test1234_c1_r')
X_1234_2 = load('data/train1234_2_c1_r')
X_test_1234_2 = load('data/test1234_2_c1_r')
X_1234_3 = load('data/train1234_3_c1_r')
X_test_1234_3 = load('data/test1234_3_c1_r')
X_anti_1234 = load('data/train_anti_1234_c1_r')
X_test_anti_1234 = load('data/test_anti_1234_c1_r')

X_union_f, X_test_union_f = load('data/XXunion_f_norm')

train_test_alt = pd.read_csv(
    'data/alt_query_features_train_and_test_v01.csv').values
train_alt = train_test_alt[:10158]
test_alt = train_test_alt[10158:]
Code Example #4
for i in range(len(stop_words)):
    stop_words[i] = stemmer.stem(stop_words[i])
stop_words = set(stop_words)

def stem_one(w):
    return stemmer.stem(w)

def process_str(s):
    s = s.lower()
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx

long_word_replace = load('data/word_to_2_replacer')

query_auto_correct = load('data/query_auto_correct')
num_to_num = load('data/num_to_num')

def process_str_replace(s):
    s = s.lower()
    if s in query_auto_correct:
        #print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    #s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    #s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    sx = re.split(r'( |\b\d+|\d+\b)', s)
    sx = [w.strip() for w in sx]
Code Example #5
File: utils.py  Project: h3nj3/crowdflower-search

def stem_one(w):
    return stemmer.stem(w)


def process_str(s):
    s = s.lower()
    # s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    s = re.sub("[^a-z0-9]", " ", s)
    wx = [stemmer.stem(z) for z in s.split(" ") if z]
    return wx
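# process_str above: lowercase, strip HTML tags, replace non-alphanumerics with
# spaces, then stem each remaining token. Illustrative (assuming an English
# Porter/Snowball stemmer): process_str("Red <b>Shoes</b>!") -> ["red", "shoe"]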


long_word_replace = load("data/word_to_2_replacer")

query_auto_correct = load("data/query_auto_correct")
num_to_num = load("data/num_to_num")
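# query_auto_correct maps raw query strings to corrected spellings (checked at
# the top of process_str_replace below); long_word_replace and num_to_num are
# additional lookup tables whose use falls in the truncated part of the file.
# process_str_replace otherwise mirrors process_str but also splits tokens on
# digit/letter boundaries before stemming.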


def process_str_replace(s):
    s = s.lower()
    if s in query_auto_correct:
        # print(s, query_auto_correct[s])
        s = query_auto_correct[s]
    # s = re.sub("- [a-z\/]+$", '', s)
    s = " ".join([z for z in BeautifulSoup(s).get_text(" ").split(" ")])
    # s = re.sub("([/ -]{1,}(purple|red|blue|white|black|green|pink|yellow|grey|silver|clear|small|large|medium|m|X|2X|xl|navy|aqua|brown|brown leather|sealed|nib|new)[ ]?){1,}$", " ", s)
    s = re.sub("[^a-z0-9]", " ", s)
    sx = re.split(r"( |\b\d+|\d+\b)", s)
Code Example #6
from nltk.corpus import stopwords
import numpy.linalg as LA

from myml.files import dump, load
from utils import process_str_replace
from myml.utils import clr_print


# In[3]:

process_str = process_str_replace
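# Alias: downstream code in this notebook uses the auto-correcting variant.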


# In[4]:

X2, X2_test, y = load("data/XXtestY250_r2td")


# In[5]:

train = pd.read_csv("input/train.csv").fillna("")
test = pd.read_csv("input/test.csv").fillna("")
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)
train.head(3)


# In[6]:

qx = set(train['query'].values)