def main():
    # define arg parser
    parser = argparse.ArgumentParser()
    parser.add_argument('--source_path', required=True)
    parser.add_argument('--dest_path', default='da_movs.feather')
    args = vars(parser.parse_args())

    # define paths
    PROJ = Path()
    DATA = PROJ / 'dados'
    RAW = DATA / 'brutos'
    PROCESSED = DATA / 'processados'

    # load main data
    dados = pd.read_feather(args['source_path'])
    dados = dados.drop_duplicates('file_json')
    paths = list(set(dados['file_json'].str[3:].to_list()))
    paths = random.sample(paths, 1)

    # execute loops to read, process and join files
    kwargs = {'n_jobs': -2, 'verbose': 10}
    movs = Parallel(**kwargs)(delayed(parse_lawsuits)(p) for p in paths)
    movs = pd.concat(movs, ignore_index=True)
    movs = movs.astype(str)
    movs.to_feather(Path(args['dest_path']))


def spectral(x, sf, f, stype, dcomplex, cycle, width, n_jobs):
    """Extract spectral information from data.

    Parameters
    ----------
    x : array_like
        Array of data
    sf : float
        Sampling frequency
    f : array_like
        Frequency vector of shape (N, 2)
    stype : string
        Spectral information to extract (use either 'pha' or 'amp')
    dcomplex : string
        Complex decomposition type. Use either 'hilbert' or 'wavelet'
    cycle : int
        Number of cycles to use for fir1 filtering.
    width : int
        Width of the wavelet.
    n_jobs : int
        Number of jobs to use. If n_jobs is -1, all of them are going to be
        used.
    """
    n_freqs = f.shape[0]
    # Filtering + complex decomposition :
    if dcomplex == 'hilbert':
        # get filtering coefficients
        b = []
        a = np.zeros((n_freqs,), dtype=float)
        forder = np.zeros((n_freqs,), dtype=int)
        for k in range(n_freqs):
            forder[k] = fir_order(sf, x.shape[-1], f[k, 0], cycle=cycle)
            _b, a[k] = fir1(forder[k], f[k, :] / (sf / 2.))
            b += [_b]
        # Filter each time series :
        xf = Parallel(n_jobs=n_jobs, **CONFIG['JOBLIB_CFG'])(delayed(filtfilt)(
            b[k], a[k], x, padlen=forder[k], axis=-1) for k in range(n_freqs))
        # Use hilbert for the complex decomposition :
        xd = np.asarray(xf)
        if stype is not None:
            xd = hilbertm(xd)
    elif dcomplex == 'wavelet':
        f = f.mean(1)  # centered frequencies
        xd = Parallel(n_jobs=n_jobs, **CONFIG['JOBLIB_CFG'])(delayed(morlet)(
            x, sf, k, width) for k in f)

    # Extract phase / amplitude :
    if stype == 'pha':
        return np.angle(xd).astype(np.float64)
    elif stype == 'amp':
        return np.abs(xd).astype(np.float64)
    elif stype is None:
        return xd.astype(np.float64)


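# Hypothetical usage sketch for spectral(), not part of the original code: it
# assumes the module-level helpers used above (fir_order, fir1, hilbertm,
# morlet, CONFIG) and numpy as np are available, and the data shape, sampling
# frequency and frequency bands below are purely illustrative.
def _example_spectral_usage():
    x = np.random.rand(10, 2000)          # e.g. (n_trials, n_times)
    f = np.array([[2., 4.], [8., 12.]])   # two frequency bands, shape (N, 2)
    # phase via FIR filtering + Hilbert transform
    pha = spectral(x, 512., f, 'pha', 'hilbert', cycle=3, width=7, n_jobs=1)
    # amplitude via Morlet wavelets
    amp = spectral(x, 512., f, 'amp', 'wavelet', cycle=3, width=7, n_jobs=1)
    return pha, amp

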
def extract_all_class_features(dataset, n_jobs=1, stride=5, patch_size=10):
    """Extract masked features from all dataset images, return features and labels"""
    cns = []
    labels = []
    for (label, cls) in enumerate(dataset.classes):
        print('Extracting masked CNs from class {}'.format(cls))
        hists = Parallel(n_jobs=n_jobs)(
            delayed(extract_masked_cns)(imname, maskname)
            for (imname, maskname) in dataset.get_class_images(cls))
        hists = np.vstack(hists)
        labels.append(label * np.ones((len(hists),), dtype=np.float32))
        cns.append(hists.astype(np.float32))
    # Stack lists in numpy arrays.
    return (cns, labels)


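# Hypothetical usage sketch, illustrative only: it assumes a dataset object
# exposing `classes` and `get_class_images(cls)` (yielding (image, mask) path
# pairs) and that extract_masked_cns is defined elsewhere in this module.
def _example_extract_usage(dataset):
    cns, labels = extract_all_class_features(dataset, n_jobs=-1)
    X = np.vstack(cns)           # one feature row per image
    y = np.concatenate(labels)   # matching class labels
    return X, y

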
def main():
    # define paths
    PROJ = Path()
    DATA = PROJ / 'dados'
    RAW = DATA / 'brutos'
    PROCESSED = DATA / 'processados'

    # load digesto
    fpath = [PROCESSED / f'processos0{i}_movs.csv' for i in range(1, 4)]
    dados = pd.concat(
        [pd.read_csv(fp, low_memory=False, dtype=str) for fp in fpath],
        ignore_index=True)
    dados['numero_cnj'] = dados['numero_cnj'].str.replace(r'\-|\.', '', regex=True)

    # load cnj inova
    inova = pd.read_feather(PROCESSED / 'da_basic_transform.feather')
    inova = inova[['file_json', 'rowid', 'numero']]
    inova = inova.dropna(subset=['numero'])

    # extract the CNJ numbers
    numero_cnj = dados['numero_cnj'].to_list()
    numero_cnj = set(numero_cnj)

    # filter inova lawsuits for which we can recover text
    inova = inova[inova['numero'].isin(numero_cnj)]
    dados = dados[dados['numero_cnj'].isin(inova['numero'])]

    # save the join keys and the digesto database
    inova.to_csv(PROCESSED / 'join_keys.csv', index=False)
    dados.to_csv(PROCESSED / 'movs_texto.csv', index=False, quoting=1)

    # produce list for extracting info from cnj inova
    inova = inova[['file_json', 'rowid']]
    inova = inova.groupby('file_json')['rowid'].apply(list)
    inova = inova.reset_index()
    inova['file_json'] = inova['file_json'].str[3:]
    inova = inova.itertuples(name=None, index=False)

    # execute loops to read, process and join files
    kwargs = {'n_jobs': -2, 'verbose': 10}
    movs = Parallel(**kwargs)(delayed(parse_lawsuits)(*p) for p in inova)
    movs = pd.concat(movs, ignore_index=True)
    movs = movs.astype(str)
    movs.to_csv(PROCESSED / 'movs_inova.csv', index=False, quoting=1)


def fit(i):
    target = y_train[:, i].toarray().ravel()
    if target.mean() == 0:
        return np.zeros((X_test.shape[0],)) - 1
    d = LogisticRegression(max_iter=10)
    d.fit(X_train, target)
    return d.predict_proba(X_test)[:, 1]

preds = Parallel(n_jobs=8, verbose=50)(delayed(fit)(i)
                                       for i in range(y_train.shape[1]))
preds = np.vstack(preds).T

# To reduce memory usage
preds = preds.astype(np.float16)

num = int(np.ceil(num_users * 0.05))

# Let's not take random users, but the ones who viewed a lot
users = train.loc[mask_test].user_id.value_counts().index[:num]

ans_inds = np.argsort(preds[users])
test_inds_dict = {k: list(ans_inds[i, -5:]) for i, k in enumerate(users)}
scorer(y_val_dict, test_inds_dict, num_users=num_users)

# For each user, find the categories which we do not want to predict
last_3weeks = train.loc[mask_test].loc[
    train.loc[mask_test].date >= train.loc[mask_test].date.max() - 21 + 1]
y_not = last_3weeks.groupby('user_id').id3.apply(set)
y_pred = {}


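# Hypothetical sketch, not the original implementation: scorer() is not shown
# above, but given that it receives a dict of true categories per user and a
# dict of top-5 predicted category indices per user, it plausibly computes
# something like mean precision@5, e.g.:
def _example_scorer(y_true_dict, y_pred_dict, num_users):
    hits = 0.0
    for user, predicted in y_pred_dict.items():
        relevant = y_true_dict.get(user, set())
        hits += len(set(predicted) & set(relevant)) / 5.0
    return hits / num_users

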
def task7(args):
    """ Core logic for task7 """
    num_cores = multiprocessing.cpu_count()
    k = int(args.k)
    tensorFileName = "userImageLocation-tensor.npy"
    factorMatricesFileName = "factor-matrices" + str(k) + ".npy"
    cwd = os.getcwd()

    # Load each object space into a dictionary where d_obj = {'id': {'term': df, ...}, ...}
    print("Loading User Space...")
    userFile = '../Data/devset_textTermsPerUser.txt'
    d_user = read_text_descriptor_files(userFile)

    print("Loading Image Space...")
    imageFile = '../Data/devset_textTermsPerImage.txt'
    d_images = read_text_descriptor_files(imageFile)

    print("Loading Location Space...")
    locFile = '../Data/devset_textTermsPerPOI.txt'
    d_locations = read_text_descriptor_files(locFile)

    user_list = list(d_user.keys())
    image_list = list(d_images.keys())
    loc_list = list(d_locations.keys())
    print(len(user_list), len(image_list), len(loc_list))

    if os.path.exists(cwd + '/' + tensorFileName):
        print("Loading Tensor...")
        tensor = np.load(tensorFileName)
    else:
        print("Creating Tensor...")

        def processInput(i):
            # Create a slice of the 3-D tensor (combinations of loc & image per user)
            print('Started for user' + str(i))
            user = user_list[i]
            array = [[0 for _ in range(len(loc_list))] for _ in range(len(image_list))]
            for j in range(len(image_list)):
                for l in range(len(loc_list)):
                    image = image_list[j]
                    loc = loc_list[l]
                    # Number of terms shared by all three entities
                    union_words = d_user[user].keys() & d_images[image].keys() \
                        & d_locations[loc].keys()
                    array[j][l] += len(union_words)
            print('Ended for user' + str(i))
            return array

        tensor = Parallel(n_jobs=num_cores - 1)(delayed(processInput)(i)
                                                for i in range(len(user_list)))
        tensor = np.array(tensor)
        print(tensor.shape)
        np.save(tensorFileName, tensor)
        print('Tensor created')

    if not os.path.exists(cwd + '/' + factorMatricesFileName):
        # Perform CP decomposition via ALS
        tensor = tensor.astype(float)
        print("Performing CP Decomposition...")
        factors = parafac(tensor=tensor, rank=k, n_iter_max=150, init='random')
        np.save(factorMatricesFileName, factors)
    else:
        factors = np.load(factorMatricesFileName)
    print("Factor Matrices created")

    indexToSpaceIds = {0: user_list, 1: image_list, 2: loc_list}

    def createGroups(factor_index):
        # Create k non-overlapping groups
        f_matrix = factors[factor_index]  # factor matrix to be used
        groups = []
        for i in range(k):
            groups.append([])
        for j in range(f_matrix.shape[0]):
            # Assign object to the one of the k groups/latent features it has the
            # highest membership towards
            object_id = indexToSpaceIds[factor_index][j]  # Map indices back to user/image/location ids
            group_index = np.argmax(f_matrix[j])
            groups[group_index].append(object_id)
        return groups

    groupsList = Parallel(n_jobs=num_cores - 1)(delayed(createGroups)(i) for i in [0, 1, 2])

    # Output Results
    with open("task7_output.txt", "w") as f:
        userGroups = groupsList[0]
        f.write("\n********** K-USER GROUPS **********\n")
        for i in range(k):
            printGroups(userGroups, i, f)

        imageGroups = groupsList[1]
        f.write("\n********** K-IMAGE GROUPS **********\n")
        for i in range(k):
            printGroups(imageGroups, i, f)

        locGroups = groupsList[2]
        f.write("\n********** K-LOCATION GROUPS **********\n")
        for i in range(k):
            printGroups(locGroups, i, f)


def inferred_bsa(job, dataset_name, cdd, cores=NUM_WORKERS):
    job.log("INF CDD {}".format(cdd))
    cdd_bsa_path = os.path.join(get_interfaces_path(dataset_name), "by_superfamily",
                                str(int(cdd)), str(int(cdd)))

    if not os.path.isfile(cdd_bsa_path + "_bsa.h5"):
        job.log("observed bsa must exist")
        return

    print("Reading obs bsa")
    store = pd.HDFStore(unicode(cdd_bsa_path + "_bsa.h5"))
    # if "/inferred" in store.keys():
    #     return

    try:
        cdd_obs_bsa = store.get("/observed")
    except KeyError:
        raise RuntimeError("Must calculate observed BSAs first")

    try:
        cdd_obs_bsa = cdd_obs_bsa[[
            "obs_int_id", "bsa", "c1_asa", "c2_asa", "face1_asa", "face2_asa",
            "complex_asa", "ppi_type"
        ]]
    except KeyError:
        job.log("Failed due to column select {}".format(cdd_obs_bsa.columns))
        raise

    cdd_obs_bsa = cdd_obs_bsa.rename(columns={
        "obs_int_id": "nbr_obs_int_id",
        "bsa": "obs_bsa",
        "c1_asa": "c1_asa_obs",
        "face1_asa": "face1_asa_obs",
        "complex_asa": "complex_asa_obs",
        "ppi_type": "ppi_type_obs"
    })

    inf_interactome_path = unicode(cdd_bsa_path + ".inferred_interactome")

    try:
        print("Reading inf interactome")
        int_store = pd.HDFStore(unicode(cdd_bsa_path + ".inferred_interactome"))

        if "/table" not in int_store.keys():
            return

        m = re.search(r"nrows->(\d+)", int_store.info())
        if not m:
            int_store.close()
            job.log("Unable to read inferred interactome")
            return

        if int(m.group(1)) > 1000000:
            int_store.close()
            return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)

        inf_interactome = int_store.get("/table")
        # pd.read_hdf(unicode(cdd_bsa_path + ".inferred_interactome"), "table").reset_index()
    except MemoryError:
        return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)

    if inf_interactome.shape[0] > 1000000:
        int_store.close()
        del inf_interactome
        return inferred_bsa_dask(cdd_obs_bsa, cdd_bsa_path)

    inf_interactome = pd.merge(inf_interactome, cdd_obs_bsa, how="left",
                               on="nbr_obs_int_id")

    # Remove redundant interfaces
    inf_interactome = inf_interactome.groupby(
        ["mol_sdi", "nbr_obs_int_id", "mol_sdi_from", "mol_sdi_to"],
        as_index=False).nth(0).reset_index(drop=True).copy()

    bsa = Parallel(n_jobs=NUM_WORKERS)(delayed(get_asa)(group) for _, group in
        inf_interactome.groupby(["mol_sdi", "nbr_obs_int_id"], as_index=False))
    bsa = pd.concat(bsa, axis=1).T
    bsa = bsa.astype({
        "mol_sdi": np.float64,
        "nbr_obs_int_id": np.float64,
        "c1_asa": np.float64,
        "face1_asa": np.float64,
        "bsa": np.float64,
        "complex_asa": np.float64,
        "pred_ratio": np.float64,
        "ppi_type": str
    })

    inf_interactome = pd.merge(inf_interactome, bsa, how="left",
                               on=["mol_sdi", "nbr_obs_int_id"])

    inf_interactome.to_hdf(unicode(cdd_bsa_path + "_bsa.h5"), "inferred",
                           format='table', append=True, complevel=9,
                           complib="bzip2")
    print(unicode(cdd_bsa_path + "_bsa.h5"))
    int_store.close()


def observed_bsa(job, dataset_name, cdd, cores=NUM_WORKERS):
    job.log("CDD {}".format(cdd))
    prefix = os.path.join(get_interfaces_path(dataset_name), "by_superfamily",
                          str(int(cdd)), str(int(cdd)))

    # if os.path.isfile(prefix + "_bsa.h5"):
    #     store = pd.HDFStore(unicode(prefix + "_bsa.h5"))
    #     if "/observed" in store.keys():
    #         store.close()
    #         return
    #     store.close()

    cdd_interactome_path = prefix + ".observed_interactome"
    cdd_interactome = pd.read_hdf(unicode(cdd_interactome_path), "table")

    if cdd_interactome.shape[0] == 0:
        job.log("CDD observed interactome is empty -- FIX!!!")
        return

    if cdd_interactome.shape[0] == 0:
        job.log("CDD observed interactome contains intra-chain PPI, skipped -- FIX!!!")
        return

    # Remove redundant interfaces
    cdd_interactome = cdd_interactome.groupby(
        ["obs_int_id", "mol_sdi_from", "mol_sdi_to"],
        as_index=False).nth(0).reset_index(drop=True).copy()

    if "mol_sdi" in cdd_interactome:
        key = "mol_sdi"
    elif "mol_sdi_id" in cdd_interactome:
        key = "mol_sdi_id"
    else:
        raise RuntimeError("sdi not in df")

    bsa = Parallel(n_jobs=NUM_WORKERS)(delayed(get_bsa)(group) for _, group in
        cdd_interactome.groupby(key, as_index=False))
    bsa = pd.concat(bsa, axis=1).T
    bsa[key] = bsa[key].astype(int)
    bsa = bsa.astype({
        "bsa": np.float64,
        "c1_asa": np.float64,
        "c2_asa": np.float64,
        "complex_asa": np.float64,
        "face1_asa": np.float64,
        "face2_asa": np.float64,
        "ppi_type": str
    })

    cdd_interactome = pd.merge(cdd_interactome, bsa, how="left", on=key)

    cdd_interactome.to_hdf(unicode(prefix + "_bsa.h5"), "observed",
                           format='table', complevel=9, complib="bzip2")
    print(unicode(prefix + "_bsa.h5"))