# Gather per-image info for every entry in `files`, either serially or via a
# worker pool, appending each result field to its own module-level list.
o = len(files)

# Destination lists, ordered to match positions 0..5 of what process_line()
# returns (field meanings are inferred from the list names only).
_info_lists = (l_id, l_width, l_height, l_ratio, l_hash, l_size)

if nthreads == 1:
    print('Extracting image info with 1 thread ...')
    k = 0  # stays 0 when `files` is empty
    # Iterate over files
    for k, f in enumerate(files, 1):
        x = process_line(f)
        for _pos, _dest in enumerate(_info_lists):
            _dest.append(x[_pos])
        # Report progress every 1000 processed files.
        if k % 1000 == 0:
            a.print_progress(k, start, o)
# Otherwise perform multi-threaded mapping
else:
    print('Extracting image info multi-threaded ... ', end='', flush=True)
    pool = Pool(nthreads)
    newdata = pool.map(process_line, files)
    pool.close()
    for x in newdata:
        for _pos, _dest in enumerate(_info_lists):
            _dest.append(x[_pos])
    # Drop the intermediate result list and reclaim its memory right away.
    # NOTE(review): gc.collect() is assumed to belong to this branch together
    # with the `del`; the original indentation was lost -- confirm.
    del newdata
    gc.collect()
mat1_t = ratio_of_matches(tx, ty) mat2_t = ratio_of_matches(ty, tx) return [ train.iloc[i]['itemID_1'], train.iloc[i]['itemID_2'], sim_t, mat1_t, mat2_t, len(tx), len(ty) ] t0 = time.time() if nthreads == 1: print('Extracting features with 1 thread ...') for i in range(0, len(train.index)): if i % 10000 == 0: a.print_progress(i, t0, len(train.index)) ftrs.append(process_row(i)) else: print('Extracting features multi-threaded ... ', end='', flush=True) pool = Pool(nthreads) ftrs = pool.map(process_row, range(0, len(train.index))) pool.close() a.print_elapsed(t0) start = time.time() print('Caching data to disk ... ', end='', flush=True) ftrs = pd.DataFrame(ftrs) ftrs.columns = [ 'itemID_1', 'itemID_2', 'simtitle', 'mattitle1', 'mattitle2', 'nwords1', 'nwords2' ]
# Load the cached test frame, keep only the ID and cleaned-JSON columns of
# both frames, and count how often each top-level JSON key occurs.
df_test = feather.read_dataframe(cache_loc + 'test.fthr')

_pair_cols = ['itemID_1', 'itemID_2', 'cleanjson_1', 'cleanjson_2']
df_train = df_train[_pair_cols]
df_test = df_test[_pair_cols]

df = pd.concat([df_train, df_test])
# All first-item JSON strings followed by all second-item JSON strings.
clean_jsons = df['cleanjson_1'].tolist() + df['cleanjson_2'].tolist()

print('Creating key dict ... ')
allkey = {}  # JSON key -> number of documents containing it
pa = 0       # number of entries that could not be processed
t0 = time.time()
for i, _raw in enumerate(clean_jsons):
    # Report progress every 100000 entries (including entry 0).
    if i % 100000 == 0:
        a.print_progress(i, t0, len(clean_jsons))
    try:
        # Single quotes are stripped before parsing.
        jx = _raw.replace("'", "")
        resx = json.loads(jx)
        for x in resx.keys():
            allkey[x] = allkey.get(x, 0) + 1
    except KeyboardInterrupt:
        # Let Ctrl-C through explicitly.
        raise
    except Exception:
        # Best effort: any entry that fails to parse is only counted.
        pa += 1

t0 = time.time()
print('Transforming key dict ... ', end='', flush=True)