コード例 #1
0
import feather
import glob
import sys
import time
import os
import gc
from multiprocessing import Pool
from PIL import Image
from collections import Counter

import libavito as a

# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Generating image info ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '2_image_info.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for multiprocessing.Pool
cache_loc = config.cache_loc  # directory for cached intermediate files
debug = config.debug  # verbosity flag
root = config.images_root  # root directory containing the image files


# Function to compute difference hash of image
def DifferenceHash(img):
    theImage = Image.fromarray(img)
    # Convert the image to 8-bit grayscale.
    theImage = theImage.convert("L")  # 8-bit grayscale
    # Squeeze it down to an 8x8 image.
コード例 #2
0
        return intersection_cardinality / float(union_cardinality)


def ratio_of_matches(x, y):
    """Return |set(x) & set(y)| / len(x), or -1.0 when x is empty.

    Note the denominator is the raw length of x (duplicates counted),
    not the number of distinct elements.
    """
    if not len(x):
        return -1.0
    shared = set(x) & set(y)
    return len(shared) / float(len(x))


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set3b title features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3b_title.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): if mode is neither 0 nor 1, root/df stay unbound and the
# selection below raises NameError — presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the ID and cleaned-title columns this script needs, then free the
# full dataframe.
train = df[['itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2']]
del df
コード例 #3
0
        return intersection_cardinality / float(union_cardinality)


def ratio_of_matches(x, y):
    """Fraction of x's entries (by raw length) that overlap set(y); -1.0 if x empty."""
    denom = len(x)
    overlap = set(x).intersection(y)
    return len(overlap) / float(denom) if denom else -1.0


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set3c JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3c_json.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError below) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the ID and attrsJSON columns this script needs, then free the rest.
train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
コード例 #4
0
        s1 = str(row[d])
        s2 = str(row[d + 1])
        values.append(jellyfish.levenshtein_distance(s1, s2))
        values.append(jellyfish.jaro_distance(s1, s2))
        #values.append(float(jellyfish.damerau_levenshtein_distance(s1,s2)) )
        values.append(fuzz.partial_ratio(s1, s2))
        values.append(fuzz.token_set_ratio(s1, s2))
        values.append(fuzz.ratio(s1, s2))
        values.append(fuzz.token_sort_ratio(s1, s2))
    return values


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set4b fuzzy cleaned text features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set4b_fuzzy_clean.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError later) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

df = df[[
    'itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1',
コード例 #5
0
import time
import gc
import random
import sys
import Levenshtein  # pip install python-Levenshtein
from haversine import haversine

import libavito as a

# Noise to add to variables to prevent overfitting, a value between +- the selected value will be added to every instance
tot_lon_noise = 0.25  # longitude jitter amplitude
tot_lat_noise = 0.25  # latitude jitter amplitude
loc_dist_noise = 10  # jitter amplitude for the location-distance feature

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2a_lev_loc.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# NOTE(review): feather and pd are used below but are not imported in the
# visible header of this fragment — presumably imported elsewhere; confirm.
if mode == 0:
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Create dataframe for features
x_all = pd.DataFrame()

# Fix the RNG seed so the added noise is reproducible across runs.
random.seed(2016)
コード例 #6
0
    # Not black magic, iterate over title/description/json
    for d in [2, 4, 6]:
        st_1 = str(row[d])
        st_2 = str(row[d + 1])
        values.append(fuzz.partial_ratio(st_1, st_2))
        values.append(fuzz.token_set_ratio(st_1, st_2))
        values.append(fuzz.ratio(st_1, st_2))
        values.append(fuzz.token_sort_ratio(st_1, st_2))
    return values


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set4a fuzzy text features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set4a_fuzzy.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError later) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

df = df[[
    'itemID_1', 'itemID_2', 'title_1', 'title_2', 'description_1',
コード例 #7
0
import sys
import feather
import time
import gc
from multiprocessing import Pool

import libavito as a

def debug(s):
    """Print the string form of ``s``, then pause one second.

    Acts as a crude, throttled trace helper so progress output stays readable.
    """
    message = str(s)
    print(message)
    time.sleep(1)

# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set3f image hamming features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3f_hamming.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for the Pool
cache_loc = config.cache_loc  # directory holding cached feather files
# NOTE(review): the config debug flag is left commented out, apparently
# because the local debug() helper above uses the same name — confirm.
#debug = config.debug
if mode == 0:
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    df = feather.read_dataframe(cache_loc + 'test.fthr')

root = config.images_root  # root directory of the image files
# Cached per-image metadata produced by an earlier pipeline stage.
image_db = feather.read_dataframe(cache_loc + 'image_database.fthr')

# Keep only the ID and image-array columns this script needs.
df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']]
コード例 #8
0
        return intersection_cardinality / float(union_cardinality)


def ratio_of_matches(x, y):
    """Share of x matched in y: |set(x) & set(y)| / len(x); -1.0 for empty x."""
    overlap = set(x) & set(y)
    try:
        return len(overlap) / float(len(x))
    except ZeroDivisionError:
        # Empty x: keep the sentinel used across these feature scripts.
        return -1.0


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set3d JSON features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3d_json1.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError below) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the ID and attrsJSON columns this script needs, then free the rest.
train = df[['itemID_1', 'itemID_2', 'attrsJSON_1', 'attrsJSON_2']]
del df
コード例 #9
0
        return intersection_cardinality / float(union_cardinality)


def ratio_of_matches(x, y):
    """Ratio of distinct x elements found in y, over len(x); -1.0 if x is empty."""
    n = len(x)
    if n == 0:
        return -1.0
    y_set = set(y)
    hits = [value for value in set(x) if value in y_set]
    return len(hits) / float(n)


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set3a description features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set3a_description.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError below) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Keep only the ID and cleaned-description columns, then free the rest.
train = df[['itemID_1', 'itemID_2', 'cleandesc_1', 'cleandesc_2']]
del df
コード例 #10
0
    if where == 'ftr':
        print(str(e) + " FTR ERROR at " + str(x) + " !!!")
        if write_info is True:
            p = open(cache_loc + 'hist_errors_extract.txt', 'a')
            p.write('FTR ERROR at ' + str(x) + "\n")
            p.close()


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set2c image histogram/hue features ...' + a.c.END)

# Suppress expected warnings raised while reducing empty / all-NaN slices.
# warnings.filterwarnings matches `message` as a regex against the START of
# the warning text, case-sensitively; numpy emits capitalised messages
# ("Mean of empty slice", "All-NaN slice encountered"), so the previous
# all-lower-case pattern never matched and the warnings were not suppressed.
# The inline (?i) flag makes the whole pattern case-insensitive.
warnings.filterwarnings(
    "ignore",
    message='(?i)mean of empty slice|all-nan (axis|slice) encountered')

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2c_hist.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Select columns required by script
df = df[['images_array_1', 'images_array_2', 'itemID_1', 'itemID_2']]
コード例 #11
0
    def close(self):
        """Release the pair of null-device descriptors held in self.null_fds."""
        first, second = self.null_fds[0], self.null_fds[1]
        os.close(first)
        os.close(second)


def suppress_pool_init():
    """Pool-worker initializer: silence the worker by pointing fds 1 and 2 at os.devnull.

    After os.dup2, stdout (fd 1) and stderr (fd 2) reference the null device,
    so anything the worker prints is discarded for the life of the process.
    """
    # Open a pair of null files
    null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
    # Assign the null pointers to stdout and stderr.
    os.dup2(null_fds[0], 1)
    os.dup2(null_fds[1], 2)
    # Fix: dup2 leaves the original descriptors open; close them so each
    # worker does not leak two file descriptors.
    for fd in null_fds:
        os.close(fd)


# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set2b_brisk.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for the image pool
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError below) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

# Select columns required by script
df = df[['itemID_1', 'itemID_2', 'images_array_1', 'images_array_2']]
コード例 #12
0
    counters = 0.0
    for d in [catid, locid, metid, pcatid, regid]:
        if row[d] == row[d + 1] and row[d] not in ["NA", np.nan]:
            values.append(1.0)
            counters += 1.0
        else:
            values.append(0.0)
    values.append(counters / 5.0)

    return values


# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Extracting set4d clean similarity features ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '3_feature_set4d_similarity_clean.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel steps
cache_loc = config.cache_loc  # directory holding cached feather files
debug = config.debug  # verbosity flag
# Load the cached train or test dataframe depending on mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves root/df unbound (NameError later) —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    root = config.train_images_root
    df = feather.read_dataframe(cache_loc + 'train.fthr')
if mode == 1:
    root = config.test_images_root
    df = feather.read_dataframe(cache_loc + 'test.fthr')

df = df[[
    'itemID_1', 'itemID_2', 'cleantitle_1', 'cleantitle_2', 'cleandesc_1',
コード例 #13
0
#########################

# Define cleaning parameters
stopwords = get_stop_words('ru')  # Russian stop-word list
# Category codes of characters to exclude during cleaning — these look like
# Unicode general categories (P* punctuation, S* symbols, C* control/other);
# confirm against the code that consumes exclude_cats.
exclude_cats = set([
    'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sk', 'Sc', 'So', 'Co', 'Cf',
    'Cc', 'Cs', 'Cn'
])
sno = nltk.stem.SnowballStemmer('russian')  # stemmer for Russian tokens

#########################

# Stage banner (a.c.* are ANSI colour codes from the project's libavito module).
print(a.c.BOLD + 'Cleaning input data ...' + a.c.END)

# Get train/test mode from launch argument
mode = a.get_mode(sys.argv, '1_data_preprocessing.py')

## Read settings required by script
config = a.read_config()
nthreads = config.preprocessing_nthreads  # worker count for parallel cleaning
cache_loc = config.cache_loc  # directory for cached intermediate files
category_loc = config.category_csv  # path to the category metadata CSV
location_loc = config.location_csv  # path to the location metadata CSV
debug = config.debug  # verbosity flag
# Choose input paths by mode (0=train, 1=test).
# NOTE(review): an unexpected mode leaves data_loc/pairs_loc unbound —
# presumably get_mode guarantees 0/1; confirm.
if mode == 0:
    data_loc = config.train_ItemInfo
    pairs_loc = config.train_ItemPairs
if mode == 1:
    data_loc = config.test_ItemInfo
    pairs_loc = config.test_ItemPairs