def cache_bottlenecks(sess,
                      image_files_metadata,
                      jpeg_data_tensor,
                      bottleneck_tensor,
                      use_tqdm_notebook_widget=True):
    """Ensures all the training, testing, and validation bottlenecks are cached.

    Because we're likely to read the same image multiple times, it can speed
    things up a lot if we calculate the bottleneck layer values once for each
    image during preprocessing, and then just read those cached values
    repeatedly during training. Here we go through all the images we've found,
    calculate those values, and save them off.

    Args:
      sess: The current active TensorFlow Session.
      image_files_metadata: Dataframe of training images for each label.
      jpeg_data_tensor: Input tensor for jpeg data from file.
      bottleneck_tensor: The penultimate output layer of the graph.
    """
    # Still not sure how robust tqdm is. Maybe will use the 'old code'.
    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='Caching...')
    else:
        tqdm.pandas(desc='Caching...')

    # This ensures the function first goes through files that are already
    # cached, so the progress bar doesn't jump back and forth between slow
    # and fast mode.
    alreadycached_first = image_files_metadata[
        BOTTLENECK_DATAFRAME_KEYWORD].apply(os.path.isfile).sort_values(
            ascending=False).index

    image_files_metadata.loc[alreadycached_first].progress_apply(
        lambda image_files_metadata_row: get_or_create_bottleneck(
            sess, image_files_metadata_row, jpeg_data_tensor,
            bottleneck_tensor),
        axis=1)
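# A minimal, standalone sketch of the progress_apply pattern that
# cache_bottlenecks relies on: tqdm.pandas() (or tqdm_notebook.pandas())
# registers a `progress_apply` method on pandas objects, which behaves like
# `apply` but renders a progress bar. The dataframe and helper below are
# purely illustrative and not part of the original code.
import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc='Demo...')

demo_df = pd.DataFrame({'path': ['a.jpg', 'b.jpg', 'c.jpg']})


def fake_bottleneck(row):
    # Stand-in for get_or_create_bottleneck; just returns the path length.
    return len(row['path'])


demo_df['bottleneck'] = demo_df.progress_apply(fake_bottleneck, axis=1)
print(demo_df)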
def textrank(self):
    tqdm.pandas(desc='get textrank>>>')
    row_list = pd.Series(self.split_list).progress_apply(
        lambda x: [[self.dic.token2id[w[0]], w[1]]
                   for w in self.get_textrank(x)])
    # row_list = row_list.progress_apply(
    #     lambda x: [[self.dic.token2id[w[0]], w[1]] for w in x])
    row_list = row_list.tolist()
    self.TEXTRANK_Vector = row_list
def run(self):
    if tools.isnotebook():
        tqdm_notebook.pandas(desc="run ilastik")
        _ = self.df.progress_apply(self.process, axis=1)
    else:
        try:
            _ = self.df.apply(self.process, axis=1)
        except OSError:
            sys.exit(">>> Check ilastik path and config.ini.")
def predict(self, X):
    # Takes a series of text and returns a series of predictions.
    if self.verbose:
        from tqdm._tqdm_notebook import tqdm_notebook
        tqdm_notebook.pandas()
        return X.progress_apply(self.predict_text_main)
    else:
        return X.apply(self.predict_text_main)
def calculate_toxicity(model, test_data):
    batch_size = 1
    max_bert_length = 220
    pytorch_conversion = False
    seed_everything(1235)
    device = torch.device('cpu')
    tqdm.pandas()

    bert_model_path = "./service/uncased_L-12_H-768_A-12/"
    base_tokenizer = BertTokenizer.from_pretrained(bert_model_path,
                                                   cache_dir=None,
                                                   do_lower_case=True)
    converted_text = convert_data(test_data, max_bert_length, base_tokenizer)
    bert_test_lengths = torch.from_numpy(
        np.array([len(x) for x in converted_text]))
    bert_test_set = torch.tensor(pad_sequences(converted_text,
                                               maxlen=max_bert_length,
                                               padding='post'),
                                 dtype=torch.long)
    bert_test_dataset = torch.utils.data.TensorDataset(bert_test_set)
    bert_test_loader = torch.utils.data.DataLoader(bert_test_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=False)
    tk2 = tqdm(enumerate(bert_test_loader),
               total=len(bert_test_loader),
               leave=False)
    output_preds = []
    for i, batch in tk2:
        tsrs = trim_tensors(batch)
        x_batch, = tuple(t.to(device) for t in tsrs)
        y_pred = model(x_batch.to(device),
                       attention_mask=(x_batch > 0).to(device),
                       labels=None)
        y_pred = torch.sigmoid(
            torch.tensor(
                y_pred[:, 0].detach().cpu().squeeze().numpy())).numpy().ravel()
        output_preds.append(y_pred)  # collect this batch's predictions
    return output_preds
def load_cache_values(image_files_metadata,
                      notebook=True,
                      tqdm_desc='',
                      project='jax-nihcc-res-00-0011',
                      bucket_name=None,
                      apply_func=lambda x: x.tostring(),
                      user_project=None):
    assert bucket_name is not None, 'please provide bucket name'
    client = storage.Client(project=project)
    if user_project is None:
        user_project = project
    bucket = client.bucket(bucket_name, user_project=user_project)

    if notebook:
        tqdm_notebook.pandas(desc=tqdm_desc)
    else:
        tqdm.pandas(desc=tqdm_desc)

    cache_values = image_files_metadata['rel_path'].progress_apply(
        lambda x: get_blob_val(x, bucket, apply_func))
    return cache_values
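# get_blob_val is defined elsewhere and not shown in this snippet. A plausible
# minimal sketch, assuming it downloads the blob at rel_path from the GCS
# bucket, parses the comma-separated cache into a numpy array, and hands that
# array to apply_func (the name, parsing, and return value are assumptions,
# not taken from the original code):
import io

import numpy as np


def get_blob_val(rel_path, bucket, apply_func):
    # Fetch the blob's text content and parse the comma-separated values.
    text = bucket.blob(rel_path).download_as_string().decode('utf-8')
    values = np.loadtxt(io.StringIO(text), delimiter=',')
    return apply_func(values)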
def assign_validation_and_other_labels_to_tiles(
        training_percentage=70,
        testing_percentage=15,
        outputfile=None,
        glob_path=None,
        lstrip_string=None,
        task_class_counts_dict={
            'mean_val': 2,
            'std_val': 2
        },
        relative_path_keyword='rel_path',
        cache_gcs_paths=None,
        use_tqdm_notebook_widget=False,
        include_cache_stats=False,
        glob_locally=False,
        backward_count_to_samplename=3,
        drop_normals=False):
    '''
    Randomly assigns slides to train/test/validation, and creates a dataframe
    with tile paths and their corresponding labels. Furthermore it adds some
    basic annotations such as patient id, a random hash, tumor/normal status,
    and some optional basic statistics (e.g. mean, std) of the image. A field
    with the relative path is also constructed from each GCSurl for downstream
    analysis.

    NOTE: calculating image statistics has been implemented for caches
    (tsv files), and needs to be implemented for JPEG files (see GH-57).

    Arguments:
        training_percentage (float): Percentage of samples in the training set
        testing_percentage (float): Percentage of samples in the test set
        outputfile (str): File to save the output dataframe. If None, the
            output will not be saved.
        glob_path (str): Glob path used only when reading the files locally.
            Works if glob_locally=True (default: None)
        lstrip_string: Prefix to be removed from GCSurl in order to create
            relative paths. In most cases it can be the bucket name.
        task_class_counts_dict (dict): Number of classes in the optional
            statistics. This is used to split the statistics into percentiles
            and label each with unique integers (0, 1, ...).
            Default: {'mean_val': 2, 'std_val': 2}
        relative_path_keyword (str): Column name used for the relative paths
            constructed from GCSurl (default: 'rel_path')
        cache_gcs_paths (str or pd.DataFrame): The list of GCSurls for all the
            tiles to be annotated. If this is a text file where each row is a
            GCSurl, the argument would be the path to that text file.
            Alternatively it can be a dataframe with a similar structure.
        use_tqdm_notebook_widget (bool): Flag to make tqdm work with notebooks
            (default: False)
        include_cache_stats (bool): Set this to True in order to calculate
            tile statistics (e.g. image mean, std). Note that this process can
            be quite time-consuming (default: False).
        glob_locally (bool): The function is able to construct the annotations
            from a local folder structure instead of GCS (default: False)

    Returns:
        cache_df (pandas.DataFrame): Dataframe containing GCSurls and their
            annotations.
    '''
    validation_percentage = 100 - testing_percentage - training_percentage  # GH-79

    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='')
    else:
        tqdm.pandas(desc='')

    if glob_locally:
        print('Globbing tiles locally...')
        cache_df = glob.glob(glob_path)
        cache_df = pd.DataFrame(cache_df, columns=[relative_path_keyword])
    else:
        cache_df = util.read_csv(cache_gcs_paths,
                                 columns=['GCSurl'],
                                 sep=',',
                                 header=None)
        cache_df[relative_path_keyword] = cache_df['GCSurl'].progress_apply(
            lambda x: x[len(lstrip_string):])

    print('Randomizing tiles...')
    cache_df = cache_df.sample(frac=1, random_state=0).reset_index(drop=True)

    print('Extracting sample ids...')
    cache_df['sample_id'] = cache_df[relative_path_keyword].progress_apply(
        lambda s: s.split('/')[-backward_count_to_samplename].split('.')[0])

    print('Extracting patient ids...')
    cache_df['patient_id'] = cache_df['sample_id'].progress_apply(
        lambda x: x[:12])

    print('Extracting tumor/normal label...')
    cache_df['is_tumor'] = cache_df['sample_id'].progress_apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    assert validation_percentage + testing_percentage < 100, \
        "There are not enough training samples"

    if drop_normals:
        print('Dropping normal samples from the list...')
        cache_df = cache_df[cache_df['is_tumor'] == 1]

    print('Extracting slide preparation method...')
    cache_df['slide_code'] = cache_df['sample_id'].map(
        lambda x: x.split('-')[-1])
    cache_df['tissue-method'] = cache_df['slide_code'].map(
        lambda x: ['Frozen', 'FFPE'][x.startswith('DX')])

    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    print('Hashing sample ids...')
    cache_df['sample_id_SHA1'] = cache_df['sample_id'].progress_apply(getSHA1)
    assert not cache_df[['sample_id_SHA1', 'sample_id']].drop_duplicates(
    )['sample_id_SHA1'].duplicated().any(), "SHA1 produced duplicates!!!"

    # The 'crossval_group' assigned to tiles may highly correlate training and
    # testing data. To avoid that, let's re-assign each patient to one category
    # (training, testing, validation):
    print('Assigning cross-validation labels to samples...')
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M, need a huge number.
    cache_df['crossval_group'] = cache_df['sample_id_SHA1'].progress_apply(
        lambda x: (int(x, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)) /
        MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        cache_df['crossval_group'] = pd.cut(
            cache_df['crossval_group'],
            [-1, testing_percentage,
             testing_percentage + validation_percentage, 100],
            labels=['testing', 'validation', 'training'])
    else:  # ignore validation set
        cache_df['crossval_group'] = pd.cut(cache_df['crossval_group'],
                                            [-1, testing_percentage, 100],
                                            labels=['testing', 'training'])

    if include_cache_stats:

        def get_cache_stats(cache_filename):
            x = np.loadtxt(cache_filename, delimiter=',')
            cache_stats = {'mean_val': x.mean(), 'std_val': x.std()}
            return pd.Series(cache_stats)

        print('Calculating per tile statistics...')
        tmp = cache_df[relative_path_keyword].progress_apply(get_cache_stats)
        print('Merging the results...')
        cache_df = pd.concat([cache_df, tmp], axis=1)
        assert tmp.shape[1] == len(task_class_counts_dict), \
            "The number of tasks needs to match the number of fields produced by get_cache_stats"

        print('Creating tile stat labels...')
        for label, class_count in task_class_counts_dict.items():
            cache_df[label + '_label'] = pd.qcut(cache_df[label],
                                                 class_count,
                                                 labels=False)

    if outputfile is not None:
        print('Saving tile dataframe to disk...')
        cache_df.to_csv(outputfile, index=False)
        print('Saved to: {:s}'.format(outputfile))

    return cache_df
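# A self-contained sketch of the hashing trick used above to assign each
# sample to a cross-validation split: the SHA1 of the sample id is mapped to
# a stable number in [0, 100), and pd.cut turns that number into a split
# label. The sample ids and the 15/15/70 boundaries below are made up for
# illustration, and str.encode replaces compat.as_bytes to keep the example
# free of TensorFlow.
import hashlib

import pandas as pd

MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # same constant as above


def crossval_percentile(sample_id):
    # Stable per-sample number in [0, 100) derived from the SHA1 digest.
    sha1 = hashlib.sha1(sample_id.encode('utf-8')).hexdigest()
    return (int(sha1, 16) %
            (MAX_NUM_IMAGES_PER_CLASS + 1)) / MAX_NUM_IMAGES_PER_CLASS * 100


demo = pd.DataFrame({'sample_id': ['TCGA-AA-0001', 'TCGA-BB-0002',
                                   'TCGA-CC-0003', 'TCGA-DD-0004']})
demo['crossval_group'] = pd.cut(
    demo['sample_id'].apply(crossval_percentile),
    [-1, 15, 30, 100],  # testing / validation / training boundaries
    labels=['testing', 'validation', 'training'])
print(demo)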
def label_cache_files(
        validation_percentage=15,
        testing_percentage=15,
        outputfile='data/cache_dataframe.txt',
        task_class_counts_dict={
            'mean_val': 2,
            'std_val': 2
        },
        glob_path='tcga_tiles/luad/filelist_luad_40x_level2_downsampl16_512x512_cache/*/tiles/tile*.jpg_cached.txt',
        lstrip_string='gs://histology/',
        BOTTLENECK_DATAFRAME_KEYWORD='rel_path',
        cache_gcs_paths='data/filelist_luad_40x_level2_downsampl16_512x512_cache.txt',
        use_tqdm_notebook_widget=True,
        include_cache_stats=False,
        glob_locally=False):
    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='')
    else:
        tqdm.pandas(desc='')

    if glob_locally:
        print('Globbing tile caches...')
        cache_df = glob.glob(glob_path)
        cache_df = pd.DataFrame(cache_df,
                                columns=[BOTTLENECK_DATAFRAME_KEYWORD])
    else:
        print('Fetching cache filenames...')
        cache_df = pd.read_csv(cache_gcs_paths, sep=',', header=None)
        cache_df.columns = ['GCSurl']
        cache_df[BOTTLENECK_DATAFRAME_KEYWORD] = cache_df[
            'GCSurl'].progress_apply(lambda x: x[len(lstrip_string):])

    print('Randomizing tiles...')
    cache_df = cache_df.sample(frac=1, random_state=0).reset_index(drop=True)

    print('Extracting sample ids...')
    cache_df['sample_id'] = cache_df[
        BOTTLENECK_DATAFRAME_KEYWORD].progress_apply(
            lambda s: s.split('/')[-3].split('.')[0])

    print('Extracting patient ids...')
    cache_df['patient_id'] = cache_df['sample_id'].progress_apply(
        lambda x: x[:12])

    print('Extracting tumor/normal label...')
    cache_df['is_tumor'] = cache_df['sample_id'].progress_apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    assert validation_percentage + testing_percentage < 100, \
        "There are not enough training samples"

    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    print('Hashing sample ids...')
    cache_df['sample_id_SHA1'] = cache_df['sample_id'].progress_apply(getSHA1)
    assert not cache_df[['sample_id_SHA1', 'sample_id']].drop_duplicates(
    )['sample_id_SHA1'].duplicated().any(), "SHA1 produced duplicates!!!"
    # The 'crossval_group' assigned to tiles may highly correlate training and
    # testing data. To avoid that, let's re-assign each patient to one category
    # (training, testing, validation):
    print('Assigning cross-validation labels...')
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M
    cache_df['crossval_group'] = cache_df['sample_id_SHA1'].progress_apply(
        lambda x: (int(x, 16) % (MAX_NUM_IMAGES_PER_CLASS + 1)) /
        MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        cache_df['crossval_group'] = pd.cut(
            cache_df['crossval_group'],
            [-1, testing_percentage,
             testing_percentage + validation_percentage, 100],
            labels=['testing', 'validation', 'training'])
    else:  # ignore validation set
        cache_df['crossval_group'] = pd.cut(cache_df['crossval_group'],
                                            [-1, testing_percentage, 100],
                                            labels=['testing', 'training'])

    if include_cache_stats:

        def get_cache_stats(cache_filename):
            x = np.loadtxt(cache_filename, delimiter=',')
            cache_stats = {'mean_val': x.mean(), 'std_val': x.std()}
            return pd.Series(cache_stats)

        print('Calculating per cache statistics...')
        tmp = cache_df[BOTTLENECK_DATAFRAME_KEYWORD].progress_apply(
            get_cache_stats)
        print('Merging the results...')
        cache_df = pd.concat([cache_df, tmp], axis=1)
        assert tmp.shape[1] == len(task_class_counts_dict), \
            "The number of tasks needs to match the number of fields produced by get_cache_stats"

        print('Creating cache stat labels...')
        for label, class_count in task_class_counts_dict.items():
            cache_df[label + '_label'] = pd.qcut(cache_df[label],
                                                 class_count,
                                                 labels=False)

    print('Saving cache dataframe to disk...')
    cache_df.to_csv(outputfile, index=False)
    print('Saved to: {:s}'.format(outputfile))

    return cache_df
def label_jpeg_files(
        task_class_counts_dict,
        validation_percentage,
        testing_percentage,
        glob_path='tcga_tiles/luad_40x_level_2_16/TCGA-*.svs/tiles/tile_*.jpg',
        cache_directory='tcga_tiles/luad_40x_level_2_16_cache',
        BOTTLENECK_DATAFRAME_KEYWORD='rel_path',
        IMAGE_DATAFRAME_KEYWORD='image_filename',
        use_tqdm_notebook_widget=True,
        label_image_stats=True):
    image_filenames = glob.glob(glob_path)
    image_files_metadata = pd.DataFrame(image_filenames,
                                        columns=[IMAGE_DATAFRAME_KEYWORD])
    image_files_metadata['sample_id'] = image_files_metadata[
        'image_filename'].map(lambda s: s.split('/')[-3].split('.')[0])
    image_files_metadata['is_tumor'] = image_files_metadata['sample_id'].apply(
        lambda s: int(s[13:15]) < 10).astype(int)

    strip_shared_path = image_files_metadata['image_filename'].str.split(
        '/').apply(pd.Series)
    idx = np.where(strip_shared_path.apply(pd.Series.nunique) != 1)[0][0]
    image_files_metadata[
        BOTTLENECK_DATAFRAME_KEYWORD] = strip_shared_path.iloc[:, idx:].apply(
            lambda s: os.path.join(cache_directory,
                                   '/'.join(s) + '_cached.txt'),
            axis=1)

    image_files_metadata = image_files_metadata.sample(
        frac=1, random_state=0).reset_index(drop=True)

    assert validation_percentage + testing_percentage < 100, \
        "There are not enough training samples"

    getSHA1 = lambda s: hashlib.sha1(compat.as_bytes(s)).hexdigest()
    image_files_metadata['sample_id_SHA1'] = image_files_metadata[
        'sample_id'].map(getSHA1)
    assert not image_files_metadata[[
        'sample_id_SHA1', 'sample_id'
    ]].drop_duplicates()['sample_id_SHA1'].duplicated().any(
    ), "SHA1 produced duplicates!!!"

    # The 'crossval_group' assigned to tiles may highly correlate training and
    # testing data. To avoid that, let's re-assign each patient to one category
    # (training, testing, validation):
    MAX_NUM_IMAGES_PER_CLASS = 2**27 - 1  # ~134M
    image_files_metadata['crossval_group'] = image_files_metadata[
        'sample_id_SHA1'].apply(lambda x: (int(x, 16) % (
            MAX_NUM_IMAGES_PER_CLASS + 1)) / MAX_NUM_IMAGES_PER_CLASS * 100)

    if validation_percentage > 0:
        image_files_metadata['crossval_group'] = pd.cut(
            image_files_metadata['crossval_group'],
            [-1, testing_percentage,
             testing_percentage + validation_percentage, 100],
            labels=['testing', 'validation', 'training'])
    else:
        image_files_metadata['crossval_group'] = pd.cut(
            image_files_metadata['crossval_group'],
            [-1, testing_percentage, 100],
            labels=['testing', 'training'])

    if use_tqdm_notebook_widget:
        tqdm_notebook.pandas(desc='Labeling...')
    else:
        tqdm.pandas(desc='Labeling...')

    def read_tile_and_calculate_intensity_stats(x):
        try:
            x_array = plt.imread(x)
        except (OSError, TypeError):
            print('\nThe following file seems to be corrupted: {:s}\n'.format(
                x))
            x_array = np.array([np.nan])
        return calculate_intensity_stats(x_array)

    temp = image_files_metadata[IMAGE_DATAFRAME_KEYWORD].progress_apply(
        read_tile_and_calculate_intensity_stats)
    assert temp.shape[1] == len(task_class_counts_dict), \
        "The number of tasks needs to match the number of fields produced by calculate_intensity_stats"
    image_files_metadata = pd.concat([image_files_metadata, temp], axis=1)

    for label, class_count in task_class_counts_dict.items():
        image_files_metadata[label + '_label'] = pd.qcut(
            image_files_metadata[label], class_count, labels=False)

    return image_files_metadata
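# calculate_intensity_stats is referenced above but not defined in this
# snippet. A minimal sketch, assuming it returns one value per key of
# task_class_counts_dict (the mean and standard deviation of pixel
# intensities), might look like this; the exact statistics computed in the
# original code are not shown here.
import numpy as np
import pandas as pd


def calculate_intensity_stats(x_array):
    # nan-aware so the np.array([np.nan]) fallback for corrupted files above
    # yields NaN statistics instead of raising.
    return pd.Series({'mean_val': np.nanmean(x_array),
                      'std_val': np.nanstd(x_array)})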
import pandas as pd  # needed below for pd.read_csv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold, BaseCrossValidator
from sklearn.decomposition import TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import SparseRandomProjection
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale
from scipy.stats import skew, kurtosis, gmean, ks_2samp
import gc
import psutil
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

tqdm.pandas()
sns.set(style="white", color_codes=True)

# In[ ]:

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train.head()

# # Feature engineering
# We will start by defining basic row aggregation features. These are used in
# most public kernels, so I will not elaborate further on this part.

# In[ ]:
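# A minimal sketch of the kind of row aggregation features referred to above.
# The exact feature set of the original kernel is not shown in this excerpt;
# the column names 'ID'/'target' and the aggregates below are assumptions for
# illustration only.
feature_cols = [c for c in train.columns if c not in ('ID', 'target')]
for df in (train, test):
    df['row_sum'] = df[feature_cols].sum(axis=1)
    df['row_mean'] = df[feature_cols].mean(axis=1)
    df['row_std'] = df[feature_cols].std(axis=1)
    df['row_nonzero'] = (df[feature_cols] != 0).sum(axis=1)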
from collections import Counter
import tensorflow as tf
from common.MultiVectorizer import *
import pandas as pd
from tensorflow.keras.layers import (Dense, Embedding, Input, LSTM,
                                     TimeDistributed, SpatialDropout1D,
                                     Conv1D, MaxPooling1D, Dropout,
                                     AdditiveAttention, Attention,
                                     GlobalAveragePooling1D, Concatenate,
                                     Bidirectional)
from tensorflow.keras.models import Model
from common.data_utils import *
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm._tqdm_notebook import tqdm_notebook
from tensorflow.keras.metrics import *

tqdm_notebook.pandas()


class AutoEncoderTextModel():

    def __init__(self, vectorizer=None, load_weights=False):
        self.vectorizer = vectorizer
        self.load_weights = load_weights
        self.METRICS = [
            BinaryAccuracy(name='accuracy'),
            Precision(name='precision'),
            Recall(name='recall'),
            AUC(name='auc')
        ]

    def load_data(self, file_path, rows=None, validation_split=None):
        data_df = pd.read_excel(file_path, nrows=rows)
        if validation_split is not None:
    for i in tqdm_notebook(range(dim_0)):
        x[i] = mat[i:TIME_STEPS + i]
        y[i] = mat[TIME_STEPS + i, y_col_index]
        # if i < 10:
        #     print(i, "-->", x[i, -1, :], y[i])
    print("length of time-series i/o", x.shape, y.shape)
    return x, y


stime = time.time()
print(os.listdir(INPUT_PATH))
df_ge = pd.read_csv(os.path.join(INPUT_PATH, "ge.us.txt"), engine='python')
print(df_ge.shape)
print(df_ge.columns)
display(df_ge.head(5))

tqdm_notebook.pandas(desc='Processing...')
# df_ge = process_dataframe(df_ge)
print(df_ge.dtypes)

train_cols = ["Open", "High", "Low", "Close", "Volume"]
df_train, df_test = train_test_split(df_ge,
                                     train_size=0.8,
                                     test_size=0.2,
                                     shuffle=False)
print("Train--Test size", len(df_train), len(df_test))

# scale the features with MinMax and build the arrays
x = df_train.loc[:, train_cols].values
min_max_scaler = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
x_test = min_max_scaler.transform(df_test.loc[:, train_cols])

print("Deleting unused dataframes of total size(KB)",
      (sys.getsizeof(df_ge) + sys.getsizeof(df_train) +
       sys.getsizeof(df_test)) // 1024)
del df_ge
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt

# plt.style.use("fivethirtyeight")
plt.style.use('ggplot')  # pick a plotting style that simply looks nicer
import seaborn as sns  # plotting package similar to matplotlib
import gc  # gc.collect() explicitly reclaims memory, see ②

sns.set(style="ticks", color_codes=True)  # use the Seaborn default plotting style
import matplotlib.pyplot as plt
from tqdm._tqdm_notebook import tqdm_notebook as tqdm  # ③

tqdm.pandas()  # ③
import datetime

# the plotly library ④
import plotly.offline as ply
# ply.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')  # suppress warnings

# functions
# pictures

# Read in the dataframes
import pandas as pd
import gc
# libs for visualization
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import plotly.offline as py
import plotly.graph_objs as go

sns.set_style('whitegrid')

# libraries
import logging.config
from pymongo import MongoClient
import tqdm
from tqdm._tqdm_notebook import tqdm_notebook as tn

tn.pandas()

mc = MongoClient('mongodb://*****:*****@IP')


def tgn(msg: str, alarmer_keys=None):
    s = Session()
    if alarmer_keys is None:
        alarmer_keys = ['YOUR_TOKEN_FROM_t.me/alarmer_bot']
    parts = msg.split('\n\n')
    for ak in alarmer_keys:
        for part in parts:
            s.get('https://alarmerbot.ru/',
                  params={'key': ak, 'message': part})


def set_logging(level="DEBUG",
                formatting=None,
                disable_existing=False,
                console=True,
                file=False,
                path=None):
import re
import pandas as pd
from janome.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas(desc="hoge progress: ")


def delete_brackets(s):
    """ Delete brackets and the text enclosed in them. """
    """ Convert half-width brackets to zenkaku (full-width). """
    table = {
        "(": "(",
        ")": ")",
        "<": "<",
        ">": ">",
        "{": "{",
        "}": "}",
        "[": "[",
        "]": "]"
    }
    for key in table.keys():
        s = s.replace(key, table[key])
    """ Delete zenkaku brackets and their contents. """
    l = [
        '([^(|^)]*)', '【[^【|^】]*】', '<[^<|^>]*>', '[[^[|^]]*]',
        '「[^「|^」]*」', '{[^{|^}]*}', '〔[^〔|^〕]*〕', '〈[^〈|^〉]*〉'
    ]
    for l_ in l:
        s = re.sub(l_, "", s)
    """ Recursive processing. """