def tag_lang(data, txt_var='text_clean'):
    """
    Tag language in all text in data, return language, score and post IDs.

    :param data: data frame
    :param txt_var: text var
    :returns lang_id_data: language, score and post ID
    """
    lang_id_model = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    # parallel
    MAX_JOBS = 5
    pandarallel.initialize(nb_workers=MAX_JOBS)
    lang_score_vals = data.loc[:, txt_var].parallel_apply(lang_id_model.classify)
    # serial
    # TODO: why does langid wreck CPU use?
    # lang_score_vals = data.loc[:, txt_var].apply(lang_id_model.classify)

    # separate lang/score
    lang_val, lang_score = zip(*lang_score_vals)
    lang_var = 'lang'
    lang_score_var = 'lang_score'
    post_id_var = 'id'
    data = data.assign(**{
        lang_var: lang_val,
        lang_score_var: lang_score,
    })
    lang_id_data = data.loc[:, [lang_var, lang_score_var, post_id_var]]
    return lang_id_data
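# Hypothetical usage sketch (not part of the original module): tag the language of a
# few posts. Assumes the module-level langid imports used above (e.g.
# `from langid.langid import LanguageIdentifier, model`) and a DataFrame with
# 'id' and 'text_clean' columns.
# posts = pd.DataFrame({'id': [1, 2],
#                       'text_clean': ['good morning', 'bonjour tout le monde']})
# lang_id_data = tag_lang(posts)
# print(lang_id_data)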
def plot_PM_vs_sent_all_sites(master_csv):
    '''
    Computes and plots the Pearson coefficient of PM2.5 vs. Sentinel band means
    across all entries in the master .csv (i.e. all readings at all sites).
    '''
    pandarallel.initialize()
    df = pd.read_csv(master_csv)
    subdir = "visuals/repaired/allsites/"
    for band in [0]:  # range(0, 13):
        label = 'b' + str(band + 1) + ' mean'
        df[label] = df.parallel_apply(get_sentinel_band_mean, band=band, axis=1)
        pearson = pearsonr(df[label], df['Daily Mean PM2.5 Concentration'])
        mean_day_diff = df['PM Reading/Image day difference'].mean()
        if np.isnan(pearson[0]):
            continue
        print("Band {} pearson: {}. Day difference: {}".format(band, pearson, mean_day_diff))
        pearson_str = "r = " + str(round(pearson[0], 3))
        plt.scatter(df[label], df['Daily Mean PM2.5 Concentration'], s=1, label=pearson_str)
        plt.xlabel("Sentinel Band " + str(band) + " Mean Value")
        plt.ylabel("PM2.5 Concentration")
        plt.title("Sentinel Band " + str(band) + " Mean vs. PM2.5 value across all sites",
                  fontsize=16)
        plt.legend(loc='upper left')
        plt.savefig(subdir + "PM_vs_sent_band_" + str(band) + "_mean_for_all_sites.png")
        plt.show()
        plt.clf()
    print("Pearson across dataset: {}".format(pearson))
def calc_all_embeddings(df, model, cfg):
    """Calculate all document embeddings."""
    # pick representative sentences from documents
    print("[clustering] choosing representative sentences...")
    if cfg.rep_sents_path != "":
        rep_sents = np.load(cfg.rep_sents_path, allow_pickle=True)
    elif cfg.num_workers == 1:
        rep_sents = df.apply(pick_rep_sentences, axis=1).to_numpy()
    else:
        pandarallel.initialize(nb_workers=cfg.num_workers)
        rep_sents = df.parallel_apply(pick_rep_sentences, axis=1).to_numpy()
        np.save("model/clustering/rep_sents.npy", rep_sents, allow_pickle=True)

    # finetune
    if cfg.sbert_finetune:
        print(f"[clustering] finetuning the model...")
        finetune_sbert(model, df, rep_sents, cfg.sbert_finetune_cfg)

    # calculate document embeddings
    print("[clustering] calculating document embeddings...")
    if cfg.doc_embs_path != "":
        doc_embs = np.load(cfg.doc_embs_path, allow_pickle=False)
    else:
        rs_tqdm = tqdm(rep_sents, position=0)
        doc_embs = np.array([embed_doc(rs, model) for rs in rs_tqdm])
        np.save("model/clustering/doc_embs.npy", doc_embs, allow_pickle=False)

    return doc_embs
def transform(test_size: float) -> None:
    """Transform the category classification dataset into fastText format.

    The original CSV file is split into *.train and *.test files where each
    file contains lines in the form '__label__<actual_label> sample text'.
    """
    pandarallel.initialize()
    cat_df = pd.read_csv(paths.CATEGORY_DATA_PATH)
    cat_df = cat_df.sample(frac=1)

    # Transform category dataset into fastText format.
    lines = cat_df.parallel_apply(
        lambda ad: f"__label__{ad['label']} {ad['sample']}", axis=1)

    test_index = int(test_size * len(lines))
    test_data = lines[:test_index]
    train_data = lines[test_index:]
    logger.info(
        f"Transformed {len(train_data)} train and {len(test_data)} test samples"
    )

    with open(paths.FASTTEXT_CATEGORY_TRAIN_PATH, "w") as fp:
        fp.writelines("\n".join(train_data))
    logger.info(
        f"Saved train dataset to '{paths.FASTTEXT_CATEGORY_TRAIN_PATH}'")

    with open(paths.FASTTEXT_CATEGORY_TEST_PATH, "w") as fp:
        fp.writelines("\n".join(test_data))
    logger.info(
        f"Saved test dataset to '{paths.FASTTEXT_CATEGORY_TEST_PATH}'")
def compute_BM25(corpus_df: pd.DataFrame, query_df: pd.DataFrame,
                 data_col: str, f_name: str, reindex: bool = False) -> np.ndarray:
    pandarallel.initialize()
    base_path = "/lfs/1/sahaana/enrichment/data/Okapi25Queries"
    corpus = list(corpus_df[data_col].parallel_apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    bm25 = query_df[data_col].parallel_apply(
        lambda x: indexed.get_scores(x.split()))
    bm25 = np.vstack(bm25)
    np.save(f"{base_path}/{f_name}.npy", bm25)
    final = np.argsort(bm25, axis=1)
    if not reindex:
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        print(f"Saved {f_name}")
        return final
    else:
        corpus_indexes = np.array(corpus_df.index)
        query_index = np.array(query_df.index)
        final = corpus_indexes[final]
        np.save(f"{base_path}/{f_name}_argsort.npy", final)
        np.save(f"{base_path}/{f_name}_QIDs.npy", query_index)
        print(f"Saved {f_name}")
        return query_index, bm25, final
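# Hypothetical usage sketch: score a small query table against a corpus table sharing a
# text column. Note that results are written under the hard-coded base_path above, so
# running this as-is requires that directory to exist.
# corpus_df = pd.DataFrame({'title': ['deep learning for nlp', 'databases 101']})
# query_df = pd.DataFrame({'title': ['nlp with deep learning']})
# ranked = compute_BM25(corpus_df, query_df, data_col='title', f_name='demo')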
def extract_features(df_transactions: pd.DataFrame, model_folder: str,
                     progress_bar: bool = False) -> pd.DataFrame:
    from .feng_utils import calc_features
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=progress_bar)

    ttype_direction_mapping, mcc_group_mapping = joblib.load(
        str(Path(model_folder) / "ttype_mcc_group_mappings.pkl"))
    ttypes = [*ttype_direction_mapping]
    mcc_groups = [*mcc_group_mapping]

    df_features = (df_transactions.groupby(["user_id"]).parallel_apply(
        lambda group: calc_features(group, ttypes, mcc_groups)).apply(pd.Series))

    returning_user_lookup_table_df = pd.read_csv(
        str(Path(model_folder) / "returning_user_lookup_table_df.csv"))
    returning_user_lookup_table_df.set_index("user_id", inplace=True)

    df_features = df_features.join(returning_user_lookup_table_df)
    df_features["is_new_customer"] = df_features["prev_monthly_in_flow_avg"].isna()

    inflow_model_features = joblib.load(
        str(Path(model_folder) / "inflow_model_features.pkl"))
    outflow_model_features = joblib.load(
        str(Path(model_folder) / "outflow_model_features.pkl"))

    extracted_features = df_features.columns
    assert set(extracted_features) >= set(inflow_model_features) and set(
        extracted_features) >= set(outflow_model_features)

    return df_features
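# Hypothetical usage sketch: featurize a transactions DataFrame against a saved model
# folder. The folder path is a placeholder; df_transactions is expected to contain at
# least a 'user_id' column plus the fields that calc_features relies on.
# features_df = extract_features(df_transactions, model_folder="models/latest",
#                                progress_bar=True)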
def transform(self, corpus):
    '''
    Inputs:
        corpus  A list of sequences. Each sequence is a list of alphabets.
    '''
    '''
    The difference between fit_transform() and transform() is:
    in transform() the alphabets are already known, while in fit_transform()
    they are not known and have to be computed. The computation in fit() is
    essentially building the alphabet set.
    '''
    if self.mode == 'default':
        sgt = corpus.apply(
            lambda x: [x['id']] + list(self.fit(x['sequence'])),
            axis=1,
            result_type='expand')
        sgt.columns = ['id'] + self.feature_names
        return sgt
    elif self.mode == 'multiprocessing':
        # Import
        from pandarallel import pandarallel

        # Initialization
        pandarallel.initialize(nb_workers=self.processors)

        sgt = corpus.parallel_apply(
            lambda x: [x['id']] + list(self.fit(x['sequence'])),
            axis=1,
            result_type='expand')
        sgt.columns = ['id'] + self.feature_names
        return sgt
def cylinder_fit(self):
    print('running new version')

    # if 'sf_radius' in self.centres.columns:
    #     del self.centres['sf_radius']
    for c in self.centres.columns:
        if 'sf' in c:
            del self.centres[c]

    node_id = self.centres[self.centres.n_points > self.min_pts].sort_values(
        'n_points').node_id.values
    groupby_ = self.pc.loc[self.pc.node_id.isin(node_id)].groupby('node_id')

    pandarallel.initialize(progress_bar=True, verbose=2)
    cyl = groupby_.parallel_apply(RANSAC_helper)
    # cyl = groupby_.apply(RANSAC_helper)

    cyl.columns = ['sf_radius', 'centre']
    cyl.reset_index(inplace=True)
    cyl.loc[:, 'sf_cx'] = cyl.centre.apply(lambda c: c[0])
    cyl.loc[:, 'sf_cy'] = cyl.centre.apply(lambda c: c[1])
    cyl.loc[:, 'sf_cz'] = cyl.centre.apply(lambda c: c[2])

    self.centres = pd.merge(
        self.centres,
        cyl[['node_id', 'sf_radius', 'sf_cx', 'sf_cy', 'sf_cz']],
        on='node_id',
        how='left')
def customer_file_parse(args, input_sentence):
    nltk.download('stopwords')
    nltk.download('punkt')

    if args.paraphrase_corpus and args.input_file_path:
        output_path = os.path.join(args.data_dir, 'new_test.tsv')
        print('File reading...')
        input_file_path = args.input_file_path
        corpus_sentences = []
        data = pd.read_csv(input_file_path, sep='\t')

        pandarallel.initialize()
        groups = data.groupby("#2 String")
        num = groups.count()['#1 ID'].iloc[0]
        g = groups.parallel_apply(group_func).set_index('Quality').reset_index()
        if num > 10:
            recall = g['recall'].sum() / groups.ngroups
            print("tf-idf recall: " + str(recall))
        g.to_csv(output_path, index=False, sep='\t')
        return corpus_sentences
def init_okapi25():
    corpus = list(self.df_r[data_col].apply(lambda x: x.split()))
    indexed = BM25Okapi(corpus)
    pandarallel.initialize()
    bm25 = self.df_r[data_col].parallel_apply(
        lambda x: indexed.get_scores(x.split()))
    return np.argsort(bm25, axis=1)
def build_people_using_nhanes_for_sampling(nhanes, n, outcome_model_repository,
                                           filter=None, random_seed=None,
                                           weights=None):
    if weights is None:
        weights = nhanes.WTINT2YR
    repeated_sample = nhanes.sample(n, weights=weights,
                                    random_state=random_seed, replace=True)
    pandarallel.initialize(verbose=1)
    people = repeated_sample.parallel_apply(
        build_person,
        outcome_model_repository=outcome_model_repository,
        axis="columns")
    for i in range(0, len(people)):
        people.iloc[i]._populationIndex = i
    if filter is not None:
        people = people.loc[people.apply(filter)]
    return people
def fit_transform(self, corpus):
    '''
    Inputs:
        corpus  A list of sequences. Each sequence is a list of alphabets.
    '''
    if len(self.alphabets) == 0:
        self.alphabets = self.estimate_alphabets(corpus['sequence'])
        self.feature_names = self.__set_feature_names(self.alphabets)

    if self.mode == 'default':
        sgt = corpus.apply(
            lambda x: [x['id']] + list(self.fit(x['sequence'])),
            axis=1,
            result_type='expand')
        sgt.columns = ['id'] + self.feature_names
        return sgt
    elif self.mode == 'multiprocessing':
        # Import
        from pandarallel import pandarallel

        # Initialization
        pandarallel.initialize(nb_workers=self.processors)

        sgt = corpus.parallel_apply(
            lambda x: [x['id']] + list(self.fit(x['sequence'])),
            axis=1,
            result_type='expand')
        sgt.columns = ['id'] + self.feature_names
        return sgt
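# Hypothetical usage sketch: build SGT embeddings for a toy corpus. The corpus is a
# DataFrame with 'id' and 'sequence' columns, and sgt_model stands in for an instance
# of the enclosing class created with mode='multiprocessing' and processors=4.
# corpus = pd.DataFrame({'id': [1, 2],
#                        'sequence': [['A', 'B', 'B', 'A'], ['C', 'A', 'C']]})
# embeddings = sgt_model.fit_transform(corpus)   # estimates the alphabets first
# more_embeddings = sgt_model.transform(corpus)  # reuses the estimated alphabets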
def skeleton(self, eps):
    # run pandarallel on groups of points
    groupby = self.pc.groupby('slice_id')
    pandarallel.initialize(nb_workers=min(24, len(groupby)), progress_bar=False)
    sent_back = groupby.parallel_apply(find_centre, self, eps).values

    # create and append clusters and filtered pc
    self.centres = pd.DataFrame()
    self.pc = pd.DataFrame()
    for x in sent_back:
        self.centres = self.centres.append(x[0])
        self.pc = self.pc.append(x[1])

    # reset index as appended df have common values
    self.centres.reset_index(inplace=True)
    self.pc.reset_index(inplace=True)

    # convert binary cluster reference to int
    MAP = {v: i for i, v in enumerate(self.centres.idx.unique())}
    if 'level_0' in self.pc.columns:
        self.pc = self.pc.drop(columns='level_0')
    if 'index' in self.pc.columns:
        self.pc = self.pc.drop(columns='index')
    self.pc.loc[:, 'node_id'] = self.pc.idx.map(MAP)
    self.centres.loc[:, 'node_id'] = self.centres.idx.map(MAP)
def compute_true_month_averages(master_csv, true_averages_csv):
    '''
    Computes the ground-truth monthly PM averages from the daily labels in a given
    master csv file and saves them to the provided true_averages_csv file.
    '''
    pandarallel.initialize()
    df = pd.read_csv(master_csv)

    # Index on 'Month' and 'Site ID' to compute averages at each station for the month
    months = df.parallel_apply(get_month, axis=1)
    df['Month'] = months
    epa_stations = df['Site ID'].unique()
    num_sites = len(epa_stations)

    with open(true_averages_csv, 'a') as fd:
        writer = csv.writer(fd)
        writer.writerow(["Site ID", "Month", "Month Average"])
        for i, station_id in enumerate(epa_stations):
            station_datapoints = df[df['Site ID'] == station_id]
            for month in range(1, 13):
                month_m_at_station_i = station_datapoints[
                    station_datapoints['Month'] == month]
                pms_for_month_m_at_station_i = month_m_at_station_i[
                    'Daily Mean PM2.5 Concentration']
                month_average = np.mean(pms_for_month_m_at_station_i)
                row = [station_id, month, month_average]
                writer.writerow(row)
def create_pretokenized_dataset():
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    # Use every core on the machine.
    pandarallel.initialize(use_memory_fs=False)

    config = deepLegisConfig("bert_128.json")

    # Create a dataframe out of the ml_data.csv by adding the text to it.
    df, _ = createDeepLegisDataFrame(config, read_cached=False)

    # Take the text and tokenize it into the final product the model wants to see.
    tokenizer = config.tokenizer

    def tokenizer_wrapper(text):
        d = tokenizer(text, truncation=True, padding='max_length',
                      max_length=config.max_length)
        return d['input_ids']

    tic = time.perf_counter()
    df['tokens'] = df.text.parallel_apply(tokenizer_wrapper)
    toc = time.perf_counter()
    logger.info(
        f"Tokenized in {(toc - tic)/60.0} min - {toc - tic:0.4f} seconds")

    print(df.head())

    # Save it for later use
    pickle_file = config.data_vol + "preprocessed_df_128.pkl"
    pickle.dump(df, open(pickle_file, "wb"))
def main():
    out_path = "/bigtemp/rm5tx/nlp_project/2016-06_all_predicted.csv"
    model = ProjModel.load_from_checkpoint(
        checkpoint_path=os.path.expanduser("~/saved_models/last.ckpt"))
    tprint("Model Loaded")

    DATA_PATH = os.path.expanduser("~/data_cache/")
    data = ProjData(max_len=128)
    data.load(DATA_PATH)
    tprint("Data Loaded")

    neg_data = get_dataset("/localtmp/rm5tx/2016-06_all.csv")

    # pool = mp.Pool(processes=15)
    pandarallel.initialize(nb_workers=18)
    tprint("Starting Tokenize")
    neg_data['tokenized'] = neg_data['data'].parallel_map(
        lambda x: tokenize(data.max_len, data.tokenizer, x))
    tprint("Finished Tokenize")

    torch.save(neg_data, open(DATA_PATH + "df_to_be_inferred.pt", "wb"))
    neg_data.to_csv(DATA_PATH + "df_to_be_inferred.csv")

    inputs = torch.tensor(neg_data['tokenized'])
    masks = inputs.ne(0)
    torch.save(inputs, open(DATA_PATH + "inputs_to_be_inferred.pt", "wb"))
    tprint("Saved Inputs")

    labels = []
    masked_input = TensorDataset(inputs, masks)
    dataloader = DataLoader(masked_input, batch_size=1000)
    model.eval()
    for batch in dataloader:
        b_input, b_mask = batch
        # print(b_input.shape)
        # print(b_mask.shape)
        labels.extend(model(b_input, b_mask).tolist())

    neg_data["label"] = pd.Series(labels)
    # print(neg_data[["data", "label", "author"]])
    neg_data.to_csv(out_path)
def main(current_price_strategy: str, price_update_strategy: str,
         market_extract_path: Path, output_path: Path, force_update: bool):
    set_log_conf(log_path=os.getcwd())
    pandarallel.initialize(progress_bar=True)
    logger = logging.getLogger(__name__)
    logger.info("Starting run")

    client = CardMarketClient()
    current_price_computer = CurrentPriceComputer(strategy_name=current_price_strategy)
    price_updater = PriceUpdater(strategy_name=price_update_strategy)

    # Get the stock as a dataframe
    stock_df = client.get_stock_df()

    # Set the market extract path
    set_market_extract_path(market_extract_parent_path=market_extract_path)

    # Load the saved product prices
    reset_market_extract(force_update=force_update)

    _get_product_market_extract = partial(get_single_product_market_extract,
                                          card_market_client=client)

    def get_product_price(product_id):
        market_extract = _get_product_market_extract(product_id=product_id)
        try:
            return current_price_computer.get_current_price_from_market_extract(
                market_extract=market_extract)
        except SuitableExamplesShortage:
            # We try with a larger request in case of a lack of suitable examples
            market_extract = _get_product_market_extract(product_id=product_id,
                                                         max_results=500)
            try:
                return current_price_computer.get_current_price_from_market_extract(
                    market_extract=market_extract)
            except SuitableExamplesShortage:
                return float("nan")

    # Put the product prices in the df
    try:
        stock_df["ActualPrice"] = stock_df["idProduct"].parallel_apply(get_product_price)
    except Exception as error:
        logger.error(error)
        raise
    finally:
        stock_df.to_csv(output_path / "stock.csv")

    # Computes the new price
    stock_df = price_updater.update_df_with_new_prices(stock_df=stock_df)

    # Saves the result
    stock_df.to_csv(output_path / "stock.csv")

    # Saves only the updated prices separately
    stock_df.loc[~pd.isna(stock_df["NewPrice"])].to_csv(output_path / "updated_stock.csv")

    logger.info("End of the run")
def init():
    pandarallel.initialize()
    our_domain = 'localhost:7070'
    locator = Nominatim(domain=our_domain, scheme='http', user_agent="myGeocoder2")
    geocode = RateLimiter(locator.reverse, min_delay_seconds=0.3)
    return geocode
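# Hypothetical usage sketch: reverse-geocode a small DataFrame of coordinates with the
# rate-limited geocoder returned by init(). Assumes a local Nominatim instance is
# actually reachable on localhost:7070; pandarallel is initialized inside init().
# geocode = init()
# points = pd.DataFrame({'lat': [52.5200, 48.8566], 'lon': [13.4050, 2.3522]})
# points['address'] = points.parallel_apply(
#     lambda row: geocode((row['lat'], row['lon'])), axis=1)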
def checkOS():
    if platform == "linux" or platform == "linux2" or platform == "darwin":
        print("Found *NIX like System.")
        from pandarallel import pandarallel  # import when we need it
        pandarallel.initialize(nb_workers=CORES, verbose=0)
        isUnix = True
    else:
        print("Found Non-*nix like System.")
        isUnix = False
    # return the flag so callers can act on the detected platform
    return isUnix
def create_non_tda_features(path,
                            fourier_window_size=[],
                            rolling_mean_size=[],
                            rolling_max_size=[],
                            rolling_min_size=[],
                            mad_size=[],
                            fourier_coefficients=[]):
    """
    INPUT:
        path: int (id of an OpenML dataset)
        fourier_window_size: a list of window sizes.
            Note: min must be > max(fourier_coefficients)
        rolling_mean_size: a list of window sizes
        rolling_max_size: a list of window sizes
        rolling_min_size: a list of window sizes
        mad_size: a list of window sizes
        fourier_coefficients: a list of all fourier coefficients to include.
            Note: max must be < min(fourier_window_size)
    OUTPUT:
        df: pandas dataframe with columns:
            max_... for rolling max features
            min_... for rolling min features
            mean_... for rolling mean features
            mad_... for rolling mad features
            fourier_... for fourier coefficients
    """
    df = get_dataset(path)
    df = df.get_data()[0]
    df.rename({'label': 'y', 'coord_0': 'x', 'coord_1': 'x_dot'},
              axis='columns', inplace=True)

    pandarallel.initialize()

    for r in rolling_max_size:
        df['max_' + str(r)] = df['x'].rolling(r).max()
    for r in rolling_mean_size:
        df['mean_' + str(r)] = df['x'].rolling(r).mean()
    for r in rolling_min_size:
        df['min_' + str(r)] = df['x'].rolling(r).min()
    for r in mad_size:
        df['mad_' + str(r)] = df['x'] - df['x'].rolling(r).min()

    if (not fourier_coefficients and fourier_window_size) or (
            not fourier_window_size and fourier_coefficients):
        print('Need to specify both the fourier coefficients and the window size')

    # rolling() expects a single integer window, so compute each requested coefficient
    # for every requested window size rather than passing the whole list to rolling().
    for w in fourier_window_size:
        for n in fourier_coefficients:
            df[f'fourier_{w}_{n}'] = df['x'].rolling(w).parallel_apply(
                lambda x: rfft(x)[n], raw=False)

    # Remove all rows with NaNs
    df.dropna(axis='rows', inplace=True)

    return df
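# Hypothetical usage sketch (dataset id and window sizes are made up for illustration;
# the OpenML dataset must expose 'label', 'coord_0' and 'coord_1' columns):
# features = create_non_tda_features(
#     path=42183,
#     rolling_mean_size=[5, 10],
#     rolling_max_size=[5],
#     rolling_min_size=[5],
#     mad_size=[5],
#     fourier_window_size=[32],
#     fourier_coefficients=[1, 2],
# )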
def add_lang(dataframe):
    '''
    Add language as a new column.

    [dataframe] : pandas dataframe
    '''
    df_copy = deepcopy(dataframe)
    pandarallel.initialize(progress_bar=True)
    df_copy['lang'] = df_copy['text'].parallel_apply(detect_lang)
    return df_copy
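# Hypothetical usage sketch: tag a tiny DataFrame of texts. Assumes detect_lang, defined
# elsewhere in this codebase, returns a language code for a single string.
# tweets = pd.DataFrame({'text': ['good morning', 'buenos días']})
# tweets_with_lang = add_lang(tweets)
# print(tweets_with_lang[['text', 'lang']])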
def process_aquifer_shapefile(shapefile: Any,
                              region_id: int,
                              name_attr: str,
                              id_attr: str,
                              app_workspace: Any) -> Dict:
    """
    Process an uploaded aquifer shapefile

    Args:
        shapefile: List of shapefile files
        region_id: Region id as listed in the database
        name_attr: Aquifer Name Column
        id_attr: Aquifer Id Column
        app_workspace: Temp App workspace

    Returns:
        Response dict with success or error string
    """
    session = get_session_obj()
    temp_dir = None

    def add_aquifer_apply(row):
        aquifer = Aquifer(region_id=region_id,
                          aquifer_name=row.aquifer_name,
                          aquifer_id=row.aquifer_id,
                          geometry=row.geometry)
        return aquifer

    try:
        start_time = time.time()
        pandarallel.initialize()
        gdf, temp_dir = get_shapefile_gdf(shapefile, app_workspace)
        gdf = gdf.dissolve(by=name_attr, as_index=False)
        # gdf.to_csv('texas_aquifers.csv')
        rename_cols = {name_attr: 'aquifer_name', id_attr: 'aquifer_id'}
        gdf.rename(columns=rename_cols, inplace=True)
        gdf = gdf[['aquifer_name', 'aquifer_id', 'geometry']]
        aquifer_list = gdf.parallel_apply(add_aquifer_apply, axis=1)
        session.add_all(aquifer_list)
        session.commit()
        session.close()
        end_time = time.time()
        total_time = end_time - start_time
        return {"success": "success", "total_time": total_time}
    except Exception as e:
        session.close()
        if temp_dir is not None:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
        return {"error": str(e)}
    finally:
        # Delete the temporary directory once the shapefile is processed
        if temp_dir is not None:
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
def main(input_file, output_folder):
    pandarallel.initialize(progress_bar=True)

    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    def extract(file_name):
        return extract_meta_data(file_name, output_folder)

    file_names = pd.read_csv(input_file, header=0, names=['path'], index_col=None)
    file_names.path.parallel_map(extract)
def flip_reads(df, t=1):
    if t == 1:
        df['trim_seq'] = df.apply(flip_reads_x, axis=1)
    else:
        pandarallel.initialize(nb_workers=t)
        df['trim_seq'] = df.parallel_apply(flip_reads_x, axis=1)
    df.seq = df.trim_seq
    df.drop('trim_seq', axis=1, inplace=True)
    return df
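# Hypothetical usage sketch: flip reads in a DataFrame of sequences, serially or with t
# worker processes. Assumes flip_reads_x, defined elsewhere in this codebase, returns the
# flipped sequence for a single row.
# reads = pd.DataFrame({'seq': ['ACGT', 'TTGA']})
# reads = flip_reads(reads, t=4)   # t > 1 routes the apply through pandarallel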
def __init__(self, person, n):
    self._outcome_model_repository = OutcomeModelRepository()
    self._qaly_assignment_strategy = QALYAssignmentStrategy()
    self._risk_model_repository = CohortRiskModelRepository()
    self.n = n

    people = pd.Series([copy.deepcopy(person) for i in range(0, n)])
    pandarallel.initialize(verbose=1)
    for i in range(0, len(people)):
        people.iloc[i]._populationIndex = i
    super().__init__(people)
def convert_to_ids(df, save_dir):
    global id_col
    global freq_bound
    global attribute_columns

    pandarallel.initialize()
    feature_columns = list(sorted(attribute_columns))
    dict_DomainDims = {}
    col_val2id_dict = {}

    for col in feature_columns:
        vals = list(set(df[col]))
        vals = list(sorted(vals))

        id2val_dict = {e[0]: e[1] for e in enumerate(vals, 0)}
        print(' > ', col, ':', len(id2val_dict))
        val2id_dict = {v: k for k, v in id2val_dict.items()}
        col_val2id_dict[col] = val2id_dict

        # Replace
        df[col] = df.parallel_apply(replace_attr_with_id, axis=1,
                                    args=(col, val2id_dict,))
        dict_DomainDims[col] = len(id2val_dict)

    print(' Feature columns :: ', feature_columns)
    print(' dict_DomainDims ', dict_DomainDims)

    # -------------
    # Save the domain dimensions
    # -------------
    file = 'domain_dims.pkl'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    f_path = os.path.join(save_dir, file)
    with open(f_path, 'wb') as fh:
        pickle.dump(dict_DomainDims, fh, pickle.HIGHEST_PROTOCOL)

    file = 'col_val2id_dict.pkl'
    f_path = os.path.join(save_dir, file)
    with open(f_path, 'wb') as fh:
        pickle.dump(col_val2id_dict, fh, pickle.HIGHEST_PROTOCOL)

    return df, col_val2id_dict
def get_features(peptide_list, index_id_list, n_cpu=1):
    if n_cpu != 1:
        pandarallel.initialize(nb_workers=n_cpu, verbose=0)
    index_data = load_index_data(index_id_list=index_id_list)
    peptide_df = pd.DataFrame()
    peptide_df['peptide'] = peptide_list
    if n_cpu == 1:
        features = peptide_df['peptide'].apply(
            lambda x: sequence_to_features(x, index_data))
    else:
        features = peptide_df['peptide'].parallel_apply(
            lambda x: sequence_to_features(x, index_data))
    return pd.DataFrame(features.tolist())
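# Hypothetical usage sketch: featurize a couple of peptide sequences with four workers.
# The index ids are placeholders; load_index_data and sequence_to_features are assumed
# to be the helpers defined elsewhere in this codebase.
# feature_df = get_features(['ACDEFGHIK', 'LMNPQRSTV'],
#                           index_id_list=['index_a', 'index_b'],
#                           n_cpu=4)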
def cal_cluster(cluster_num=500):
    """
    Calculate the BoVW clustering centres.
    """
    pandarallel.initialize(nb_workers=50, use_memory_fs=False)
    train_list = pd.read_csv('train_list.csv')
    bag_of_features = []
    features = train_list.parallel_apply(cal_descriptors, axis=1)
    for f in features:
        bag_of_features += f
    clusters = kmeans(np.array(bag_of_features).astype('float32'),
                      cluster_num, initialization="PLUSPLUS")  # kmeans clustering
    return clusters
def parse_multiple(self, df, multiproc=False):
    """
    Parses elements and puts them in a dataframe

    :param df: Pandas dataframe
    :param multiproc: Boolean. True activates multiprocessing
    :return:
    """
    if multiproc:
        pandarallel.initialize()
        elements_df = df[self.text_col].parallel_apply(self.collect_parsed)
    else:
        elements_df = df[self.text_col].apply(self.collect_parsed)
    multiple_elements_df = pd.concat([df, elements_df], axis=1)
    return multiple_elements_df
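# Hypothetical usage sketch: run the parser over a small DataFrame with multiprocessing
# enabled. parser stands in for an instance of the enclosing class, with text_col set to
# the name of the column holding raw text.
# docs = pd.DataFrame({'text': ['First document.', 'Second document.']})
# parsed = parser.parse_multiple(docs, multiproc=True)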
def main(argv=None):
    """Loads the original corpora, applies normalization and caches the process in csv files."""
    try:
        pandarallel.initialize(progress_bar=True, use_memory_fs=True)
    except SystemError:
        pandarallel.initialize(progress_bar=True)

    OUT_DIR.mkdir(exist_ok=True)

    for corpus in [TIGER, HDT]:
        t0 = time()
        df = get_original_corpus(corpus, print_sample=-50)
        print(f'Writing {FILES[PREPROCESSED](corpus)}')
        df.to_csv(FILES[PREPROCESSED](corpus), sep='\t', index=False)
        print(f'{corpus} done in {time() - t0:.2f}s\n')