def __init__(
        self,
        id,
        dataset_dir,
        output_dir,
        n_splits,
        base_train_config,
        folds,
):
    params = locals()
    torch.manual_seed(0)
    ids = pipe(
        range(n_splits),
        filter(lambda x: x in folds),
        list
    )
    train_df_path = delayed(load_train_df)(
        dataset_dir=join(dataset_dir, 'train'),
        output=join(output_dir, 'train.pqt')
    )
    train_df = delayed(pd.read_parquet)(train_df_path)
    kfolded = delayed(kfold)(train_df, n_splits)
    train_sets = pipe(
        ids,
        map(lambda x: delayed(lambda i: i[x])(kfolded)),
        list
    )
    model_paths = pipe(
        zip(ids, train_sets),
        map(lambda x: delayed(train_fusion)(
            **base_train_config,
            model_path=join(output_dir, f"{id}-fold-{x[0]}-base-model.pt"),
            sets=x[1],
            log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0]}/base',
        )),
        list
    )
    test_df_path = load_test_df(
        dataset_dir='/store/tellus/test',
        output=join(output_dir, 'test.pqt')
    )
    test_df = delayed(pd.read_parquet)(test_df_path)
    test_dataset = delayed(TellusDataset)(
        test_df,
        has_y=False,
    )
    submission_df_path = delayed(predict)(
        model_paths=model_paths,
        log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/sub',
        dataset=test_dataset,
        log_interval=10,
        out_path=f'{output_dir}/{id}_submission.tsv',
    )
    self.output = delayed(lambda x: x)((
        model_paths,
        submission_df_path,
    ))
def delete_cascade(self, id):
    import mlboard_api.query as qs
    pipe(
        self.get_children(id),
        map(lambda x: (
            getattr(qs, x.__class__.__name__)(
                session=self.session
            ).delete_cascade(x.id)
        )),
        list
    )
    self.filter(self.entitiy_class.id == id).delete()
    return id
def predict(
        model_dirs,
        dataset,
        out_path,
        batch_size=512,
):
    device = torch.device("cuda")
    models = pipe(
        model_dirs,
        map(lambda x: os.path.join(x, '*.pt')),
        map(glob.glob),
        concat,
        map(torch.load),
        map(lambda x: x.eval().to(device)),
        list
    )
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
    )
    rows = []
    y_preds = []
    y_ids = []
    with torch.no_grad():
        for sample in loader:
            ids = sample['id']
            palser_x = sample['palsar'].to(device)
            normal_outputs = pipe(
                models,
                map(lambda x: x(palser_x)[0]),
                list,
            )
            output = pipe(
                [*normal_outputs],
                map(lambda x: x.softmax(dim=1)),
                reduce(lambda x, y: (x + y) / 2),
                lambda x: x.argmax(dim=1),
            )
            y_ids += ids
            y_preds += output.cpu().detach().tolist()
    rows = pipe(
        zip(y_ids, y_preds),
        map(lambda x: {
            'id': x[0],
            'lable': x[1]
        }),
        list
    )
    df = pd.DataFrame(rows)
    df.to_csv(out_path, sep='\t', header=False, index=False)
    return out_path
def __init__(self,
             feature_size=8,
             depth=3,
             ):
    super().__init__()
    self.down_layers = nn.ModuleList([
        DownSample(1, feature_size * 2 ** depth),
        *pipe(
            range(depth),
            reversed,
            map(lambda x: DownSample(
                feature_size * (2 ** (x + 1)),
                feature_size * (2 ** x),
            )),
            list,
        )
    ])
    self.center = DownSample(
        in_ch=feature_size,
        out_ch=feature_size,
    )
    self.up_layers = nn.ModuleList([
        *pipe(
            self.down_layers,
            reversed,
            map(lambda x: x.out_ch),
            take(depth),
            map(lambda x: UpSample(
                feature_size,
                feature_size,
                x,
            )),
            list,
        ),
        UpSample(
            feature_size,
            feature_size,
            feature_size * 2 ** depth,
        ),
    ])
    self._output = nn.Conv2d(
        feature_size,
        2,
        kernel_size=3
    )
def forward(self, x, others, size):
    out = pipe(
        [x, *others],
        map(lambda x: F.interpolate(x, mode='bilinear', size=size)),
        list
    )
    out = torch.cat([*out], 1)
    out = self.block(out)
    return out
def validate(predicts, dataset, batch_size):
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
    )
    y_preds = np.array(predicts).mean(axis=0).argmax(axis=1)
    y_trues = pipe(
        loader,
        map(lambda x: x['label'].cpu().detach().tolist()),
        reduce(lambda x, y: x + y),
        np.array,
    )
    score = iou(
        y_preds,
        y_trues,
    )
    tn, fp, fn, tp = confusion_matrix(y_trues, y_preds).ravel()
    return {
        'TPR': tp / (tp + fn),
        'FNR': fn / (tp + fn),
        'FPR': fp / (fp + tn),
        'acc': (tp + tn) / (tp + tn + fp + fn),
        'pre': tp / (tp + fp),
        'iou': tp / (fn + tp + fp),
    }
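# Worked example of the metric definitions returned by validate, on toy labels
# (illustrative values only, computed directly with numpy/sklearn as above):
import numpy as np
from sklearn.metrics import confusion_matrix

y_trues = np.array([1, 1, 1, 0, 0])
y_preds = np.array([1, 0, 1, 0, 1])
tn, fp, fn, tp = confusion_matrix(y_trues, y_preds).ravel()  # 1, 1, 1, 2
assert (tp / (tp + fn), fp / (fp + tn)) == (2 / 3, 1 / 2)    # TPR, FPR
assert tp / (fn + tp + fp) == 0.5                            # IoU of the positive class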
def __call__(self, epoch):
    cyclic = 1.0
    phase = epoch % self.period
    turn_phase, ratio = self.turning_point
    turn_cyclic = self.min_factor + self.range * ratio
    if phase <= turn_phase:
        cyclic = (
            self.min_factor
            + (turn_cyclic - self.min_factor) * phase/turn_phase
        )
    else:
        cyclic = turn_cyclic + \
            (self.max_factor - turn_cyclic) * \
            (phase - turn_phase)/(self.period - turn_phase)
    gamma = pipe(
        self.milestones,
        filter(lambda x: x[0] <= epoch),
        map(lambda x: x[1]),
        last
    )
    return cyclic * gamma
def get_train_row(base_path, label_dir, label):
    rows = pipe(
        [
            ("PALSAR", "before"),
            ("PALSAR", "after"),
            ("LANDSAT", "before"),
            ("LANDSAT", "after"),
        ],
        map(lambda x: (base_path, *x, label_dir, "*.tif")),
        map(lambda x: os.path.join(*x)),
        map(glob.glob),
        list,
        lambda x: zip(*x),
        map(lambda x: list(map(Path)(x))),
        map(lambda x: {
            "id": x[0].name,
            "label": label,
            "palsar_before": str(x[0]),
            "palsar_after": str(x[1]),
            "landsat_before": str(x[2]),
            "landsat_after": str(x[3]),
        }),
        list
    )
    return rows
def take_topk(scores, paths, top_num):
    return pipe(
        zip(scores, paths),
        lambda x: topk(top_num, x, key=lambda y: y[0]),
        map(lambda x: x[1]),
        list
    )
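# Minimal usage sketch for take_topk (hypothetical scores and paths): keep the
# paths whose scores rank in the top-k, ordered by descending score.
scores = [0.2, 0.9, 0.5]
paths = ['a.pt', 'b.pt', 'c.pt']
assert take_topk(scores, paths, 2) == ['b.pt', 'c.pt']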
def to_dict(self, convert_values: bool = False) -> MutableMapping[str, Any]:
    to_fields = curried.pipe(
        fields(self.__class__),
        curried.map(lambda a: (a, curried.get_in([to_key], a.metadata))),
        curried.filter(lambda f: f[1]),
        list,
    )
    if convert_values:
        d = asdict(self)
    else:
        d = {
            a.name: getattr(self, a.name)
            for a in fields(self.__class__)
        }
    if not to_fields:
        return d
    return curried.reduce(
        lambda acc, f: curried.update_in(acc, f[1], lambda _: d[f[0].name]),
        to_fields,
        {},
    )
def get_hashtag_string(given_item):
    """Return a string of hashtags associated with the given item"""
    return tz.pipe(
        tz.get_in(['entities', 'hashtags'], given_item, default=[]),
        tz.map(lambda x: tz.get_in(['text'], x, default=None)),
        tz.filter(lambda x: x is not None),
        lambda x: ", ".join(x))
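# Usage sketch with a toy tweet-shaped dict (hypothetical content):
tweet = {'entities': {'hashtags': [{'text': 'python'}, {'text': 'toolz'}]}}
assert get_hashtag_string(tweet) == "python, toolz"
assert get_hashtag_string({}) == ""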
def find_domain_urls(self, domain: str) -> List[str]:
    """
    Get all known urls for domain.

    Returns
    -------
    all_urls : list of str
    """
    def _urlkey_to_url(urlkey):
        try:
            # very rare bugged urlkeys appear
            domain, path = urlkey.split(')/', 1)
        except ValueError:
            return
        domain = domain.split(',')
        domain.reverse()
        domain = '.'.join(domain)
        if path:
            return '/'.join([domain, path])
        return domain

    urls_by_index = map(
        lambda ind: self.__get_domain_urls_in_index(ind, domain),
        self.indexes)
    all_urls = pipe(urls_by_index,
                    concat,
                    map(bytes.decode),
                    map(_urlkey_to_url),
                    filter(None),
                    map(unquote),
                    map(lambda x: x.strip()),
                    unique,
                    list)
    return all_urls
def sum_path():
    n = 1
    pos = (0, 0)
    side_length = 1
    sum_dict = {(0, 0): 1}
    step_fns = it.cycle([
        lambda x: (x[0] + 1, x[1]),
        lambda x: (x[0], x[1] + 1),
        lambda x: (x[0] - 1, x[1]),
        lambda x: (x[0], x[1] - 1)
    ])
    step_fn = next(step_fns)
    rotation_break_seq = set()
    while True:
        if is_odd_square(n - 1):
            step_fn = next(step_fns)
            side_length += 2
            delta_seq = [
                side_length - 2,
                side_length - 2 + 1,
                side_length - 2 + 1
            ]
            rotation_break_seq = cc.pipe(it.accumulate([n] + delta_seq),
                                         cc.drop(1),
                                         set)
        elif n in rotation_break_seq:
            step_fn = next(step_fns)
        sum_dict[pos] = neighbors_sum(pos, sum_dict)
        yield (pos, sum_dict[pos])
        pos = step_fn(pos)
        n += 1
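# Usage sketch (hypothetical target value; relies on is_odd_square and
# neighbors_sum defined alongside sum_path): the generator yields each spiral
# position together with its neighbour sum, so the first sum above a target is
first_larger = next(value for _, value in sum_path() if value > 747)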
def serde_with_class(cls):
    from_fields = list(
        map(lambda a: (a, get_in([from_key], a.metadata, [a.name])),
            fields(cls)))
    to_fields = pipe(
        fields(cls),
        map(lambda a: (a, get_in([to_key], a.metadata))),
        filter(lambda f: f[1]),
        list,
    )

    def from_dict(d):
        return cls(**dict(
            map(
                lambda f: (f[0].name, get_in(f[1], d, f[0].default)),
                from_fields,
            )))

    def to_dict(self):
        d = asdict(self)
        return reduce(
            lambda acc, f: update_in(acc, f[1], lambda _: d[f[0].name]),
            to_fields,
            {},
        )

    cls.from_dict = staticmethod(from_dict)
    cls.to_dict = to_dict
    return cls
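# Usage sketch for serde_with_class (hypothetical class; assumes fields/asdict
# above come from the standard dataclasses module and that from_key/to_key are
# the module's metadata key constants -- swap in attr.s/attr.ib instead if the
# surrounding module is attrs-based):
from dataclasses import dataclass, field


@serde_with_class
@dataclass
class Contact:
    name: str = None
    email: str = field(default=None,
                       metadata={from_key: ['detail', 'email'],
                                 to_key: ['detail', 'email']})


c = Contact.from_dict({'name': 'ada', 'detail': {'email': 'ada@example.com'}})
assert c.email == 'ada@example.com'
# only fields that carry a to_key path are emitted by to_dict()
assert c.to_dict() == {'detail': {'email': 'ada@example.com'}}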
def _get_wf_call_failures(metadata, opts):
    calls = []
    if 'calls' in opts:
        calls = opts['calls'].split(',')
    else:
        calls = metadata['calls'].keys()

    jobids = None
    if 'jobids' in opts:
        jobids = set(opts['jobids'].split(','))

    fails = {}
    for c in calls:
        tasks = metadata['calls'][c]
        failures = pipe(
            tasks,
            filter(lambda x: get('executionStatus', x) == 'Failed'),
            filter(lambda x: _valid_job_id(jobids, get('jobId', x))),
            map(
                lambda x: {
                    'jobId': get('jobId', x),
                    # 'inputs' : get('inputs', x),
                    'stderr': get('stderr', x),
                    'shard': get('shardIndex', x),
                    'err_msg': get_in(['failures', 0, 'message'], x, 'NA'),
                    # 'jes' : get('jes', x),
                    # 'runtime' : get('runtimeAttributes', x),
                    'rc': get('returnCode', x, 'NA'),
                }),
            list)
        fails[c] = failures
    return fails
def is_not_reg(rle_mask):
    if isinstance(rle_mask, str):
        return pipe(rle_mask.split(' '),
                    len,
                    lambda x: x > 6)
    else:
        return True
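# Illustrative RLE strings (hypothetical values): is_not_reg returns True for
# non-string masks (e.g. NaN) and for masks encoded with more than six
# run-length tokens.
assert is_not_reg('1 5 10 5 20 5') is False       # 6 tokens
assert is_not_reg('1 5 10 5 20 5 30 5') is True   # 8 tokens
assert is_not_reg(float('nan')) is True           # non-string mask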
def get_summary(annotations: Annotations, labels: Labels) -> t.Any:
    count = len(annotations)
    label_count = pipe(annotations,
                       map(lambda x: len(x["label_ids"])),
                       list,
                       np.array)
    label_hist = {
        5: np.sum(label_count == 5),
        4: np.sum(label_count == 4),
        3: np.sum(label_count == 3),
    }
    label_ids = pipe(
        annotations,
        mapcat(lambda x: x["label_ids"]),
        list,
        np.array,
    )
    total_label_count = len(label_ids)
    top = pipe(
        frequencies(label_ids).items(),
        topk(5, key=lambda x: x[1]),
        map(lambda x: (
            f"{labels[x[0]].category}::{labels[x[0]].detail}",
            x[1],
        )),
        list,
    )
    worst = pipe(
        frequencies(label_ids).items(),
        topk(5, key=lambda x: -x[1]),
        map(lambda x: (
            f"{labels[x[0]].category}::{labels[x[0]].detail}",
            x[1],
        )),
        list,
    )
    return {
        "count": count,
        "label_hist": label_hist,
        "label_count_mean": label_count.mean(),
        "label_count_median": np.median(label_count),
        "label_count_max": label_count.max(),
        "label_count_min": label_count.min(),
        "total_label_count": total_label_count,
        "top": top,
        "worst": worst,
    }
def get_categories(given_dict):
    """Return a string of the categories associated with a post"""
    return tz.pipe(
        tz.get_in(['object', 'tags'], given_dict, default=[]),
        tz.filter(lambda x: tz.get_in(['objectType'], x, default=None) == 'category'),
        tz.map(lambda x: tz.get_in(['displayName'], x, default=None)),
        lambda x: ", ".join(x)
    )
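# Usage sketch with a toy WordPress-style activity dict (hypothetical content):
post = {'object': {'tags': [
    {'objectType': 'category', 'displayName': 'News'},
    {'objectType': 'hashtag', 'displayName': 'ignored'},
    {'objectType': 'category', 'displayName': 'Tech'},
]}}
assert get_categories(post) == "News, Tech"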
def add_consistency_noise(batch_images, ):
    filped = batch_images.flip([3])
    return pipe(
        batch_images,
        map(ramdom_erase),
        list,
        torch.stack
    )
def batch_aug(aug, batch, ch=3):
    return pipe(
        batch,
        map(lambda x: [aug(x[0:ch, :, :]), aug(x[ch:2*ch, :, :])]),
        map(lambda x: torch.cat(x, dim=0)),
        list,
        torch.stack
    )
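# Minimal usage sketch for batch_aug (hypothetical shapes): the same
# augmentation is applied to the first and the second `ch`-channel half of
# each sample (e.g. a before/after pair) before the batch is re-stacked.
import torch


def hflip(t):
    return t.flip(-1)


batch = [torch.randn(6, 40, 40) for _ in range(4)]
out = batch_aug(hflip, batch, ch=3)
assert out.shape == (4, 6, 40, 40)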
def get_segment_indices(dataset, filter_indcies):
    df = dataset.df
    filtered = df.iloc[filter_indcies]
    return pipe(
        filtered[filtered['is_empty'] == False].index,
        map(df.index.get_loc),
        list
    )
def bulk_insert(self, objects):
    if len(objects) > 0:
        sql = self.entitiy_class.__table__.insert()\
            .values(pipe(objects,
                         map(lambda x: x if isinstance(x, dict) else x.to_dict()),
                         list))
        self.session.execute(sql)
        self.session.commit()
def forward(self, palser_x, landsat_x):
    palser_x = self.pad(palser_x)
    x = pipe(
        [landsat_x, palser_x],
        map(lambda x: F.interpolate(
            x,
            mode='bilinear',
            size=(self.resize, self.resize)
        )),
        list,
        lambda x: torch.cat(x, dim=1)
    )
    x = self.fusion_enc(x)
    x = self.logit_out(x).view(-1, 2)
    return x
def save_first(stream_key, stream_iterator):
    """Save the first entry in the stream as an example"""

    def parse_entry(given_entry):
        """parse either the WordPress stream strings, or the semi-parsed twitter stream"""
        if isinstance(given_entry, str):
            return json.loads(given_entry)
        else:
            return dict(given_entry)

    file_name = '../data/samples/example_{}.json'.format(stream_key)
    with open(file_name, 'w') as outfile:
        tz.pipe(
            next(stream_iterator),  # first entry
            parse_entry,            # parse
            json.dumps,             # unparse
            outfile.write)          # save
    print("Saved {}".format(file_name))
    return True
def __init__(self, in_ch, feature_size=64, depth=3, ratio=2):
    super().__init__()
    self.down_layers = nn.ModuleList(
        [
            DownSample(
                in_ch=in_ch,
                out_ch=feature_size,
            ),
            *pipe(
                range(depth),
                map(lambda d: DownSample(
                    in_ch=int(feature_size*ratio**(d)),
                    out_ch=int(feature_size*ratio**(d + 1)),
                )),
                list,
            )
        ]
    )
    self.center = DownSample(
        in_ch=feature_size*ratio**depth,
        out_ch=feature_size*ratio**depth,
    )
    self.up_layers = nn.ModuleList([
        *pipe(
            range(depth),
            reversed,
            map(lambda l: UpSample(
                in_ch=feature_size * ratio**(l+1) + feature_size*ratio**(l+1),
                out_ch=feature_size*ratio**l,
            )),
            list,
        ),
        UpSample(
            in_ch=feature_size + feature_size,
            out_ch=feature_size,
        ),
    ])
    self.out_ch = feature_size
def test_flip():
    writer = SummaryWriter(f'{config["TENSORBORAD_LOG_DIR"]}/test')
    dataset_df = load_dataset_df('/store/kaggle/tgs')
    dataset = TgsSaltDataset(dataset_df)
    writer.add_image(
        "flip",
        vutils.make_grid(
            pipe(range(8),
                 map(lambda x: dataset[12]),
                 map(lambda x: [x['image'], x['mask']]),
                 concat,
                 list)),
    )
def _post(target, methods, entities=[]):
    with DBSession() as sess:
        query_class = eval(f"qry.{target}")
        q = query_class(
            entities=pipe(entities,
                          map(lambda x: eval(f'ms.{x}')),
                          list),
            session=sess,
        )
        for m in methods:
            q = getattr(q, m['name'])(*m['args'], **m['kwargs'])
        return q
def find_unbalanced(self):
    node = self
    while True:
        grouped_children = node.grouped('children')
        unbalanced = cc.pipe(grouped_children,
                             cc.valfilter(lambda x: len(x) == 1),
                             lambda x: cc.first(x.values())[0])
        if unbalanced.children_are_balanced:
            return unbalanced
        node = unbalanced
def grouped(self, group, key=lambda x: x.weight):
    if group == 'siblings' and not self.parent:
        return {self.weight: [self]}
    elif group in {'siblings', 'children'}:
        agg = self.siblings if group == 'siblings' else self.children
        return cc.pipe(((key(x), x) for x in agg),
                       cc.groupby(lambda x: x[0]),
                       cc.valmap(lambda x: [y[1] for y in x]))
    else:
        return {}
def add_noise(batch_images, erase_num, erase_p):
    ramdom_erase = RandomErasing(
        num=erase_num
    )
    return pipe(
        batch_images,
        map(ramdom_erase),
        list,
        torch.stack
    )
def validate(x, y, epoch):
    score = pipe(
        zip(
            x.argmax(dim=1).cpu().detach().numpy(),
            y.cpu().detach().numpy()
        ),
        map(lambda x: iou(*x)),
        list,
        np.mean
    )
    return score
def test_kfold():
    output = load_train_df(
        dataset_dir='/store/tellus/train',
        output='/store/tmp/train.pqt'
    )
    df = pd.read_parquet(output)
    sets = kfold(df, n_splits=10)
    for s in sets:
        assert pipe(
            s['train_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['val_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['train_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert pipe(
            s['val_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert len(s) == 4
def __init__(self, epoch_size, len_indices, shuffle=True, start_at=0):
    self.shuffle = shuffle
    self.epoch_size = epoch_size
    self.len_indices = len_indices
    indices = range(len_indices)
    self.chunks = pipe(
        range(0, len_indices//epoch_size),
        map(lambda x: indices[x*epoch_size:(x+1)*epoch_size]),
        map(list),
        list,
    )
    self.chunk_idx = start_at
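# Standalone check of the chunking logic above (illustrative numbers; assumes
# pipe/map come from toolz.curried as elsewhere in this code): with
# len_indices=10 and epoch_size=3 the trailing partial chunk is dropped.
from toolz.curried import map, pipe

indices = range(10)
chunks = pipe(
    range(0, 10 // 3),
    map(lambda x: indices[x * 3:(x + 1) * 3]),
    map(list),
    list,
)
assert chunks == [[0, 1, 2], [3, 4, 5], [6, 7, 8]]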
def worker(pipeline, bam_fname, result_q, contig_q,
           paired=False, singles_q=None, max_singles=1000,
           is_singles_mixer=False, single_src_cnt=None):
    """Given a pipeline, run it with reads from the given bam taken from contigs
    supplied over the contig_q.

    This expects the pipeline to yield one final result which it can then return.

    It expects the last element of pipeline to be a function that consumes a read
    iterator and returns a result. This is more flexible than you think, since the
    result can be an iterator, so this can be used to filter reads in parallel. See
    examples in the filter analysis tutorial

    :param pipeline: A list of pipelines
    :param bam_fname: Source BAM file
    :param result_q: The result is put here.
    :param contig_q: messages are of the form (ref, True/False)
                     ref is the name of the contig
                     True/False indicates if eof should be set T/F
                     This controls whether we read to end of file including all the
                     unmapped reads. The caller figures out if this is that last
                     contig that sits just before that tail of unmapped reads at the
                     end of the BAM file
    :param paired: Do we pair the reads before passing them to the pipeline?
    :param singles_q: messages are SAM strings of reads converted using tostring().
                      This is only used/relevant if paired=True because we use that
                      to collect the singles from all contigs and pair them up
                      Depending on whether this is the last
    :param max_singles: When we have these many singles, start passing them to the
                        singles mixer
    :param is_singles_mixer: Set True if this is also the "singles mixer" that
                             receives unpaired reads from other workers
    :param single_src_cnt: How many sources of singles we have
                           This is
    :return:
    """
    if paired and singles_q is None:
        raise RuntimeError('Need singles_q to be defined if using paired reads')

    fp = pysam.AlignmentFile(bam_fname)
    if paired:
        t1 = paired_read_iter(fp, contig_q,
                              singles_q=singles_q, max_singles=max_singles,
                              is_singles_mixer=is_singles_mixer,
                              single_src_cnt=single_src_cnt)
    else:
        t1 = unpaired_read_iter(fp, contig_q)

    sink = pipeline[-1]
    result_q.put(sink(cyt.pipe(t1, *pipeline[:-1])))
def reformat_timestamp(given_ts):
    """Reformat into WordPress.com format"""
    # Twitter example: "Sat Oct 10 14:48:34 +0000 2015"
    # WordPress example: "2015-10-10T19:42:34Z"
    if given_ts is None:
        return ""
    try:
        return tz.pipe(
            given_ts,
            lambda x: dt.datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y"),
            lambda x: x.strftime("%Y-%m-%dT%H:%M:%SZ"))
    except:
        # If it can't reformat it, just use the previous version
        return str(given_ts)
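# Example conversions (the first input is the Twitter-style timestamp quoted in
# the comment above; the others illustrate the fallback paths):
assert reformat_timestamp("Sat Oct 10 14:48:34 +0000 2015") == "2015-10-10T14:48:34Z"
assert reformat_timestamp(None) == ""
assert reformat_timestamp("not a timestamp") == "not a timestamp"  # str() fallback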
def connect_to_twitter_stream(stream_key, saveing_function):
    """Connect to & consume a Twitter stream"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(),  # public sampled stream
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        # tz.filter(is_user_lang_tweet(["en", "en-AU", "en-au", "en-GB", "en-gb"])),  # filter to English
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )
    # Collect
    saveing_function(stream_key, stream)
def connect_to_twitter_filtered_stream(stream_key, saveing_function):
    """Connect to & consume a filtered Twitter stream, where Twitter does some of the filtering"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(**CONFIG['twitter_filter']),
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )
    ## Collect
    saveing_function(stream_key, stream)
def connect_to_wordpress_stream(stream_key, saveing_function):
    """Connect to & consume a WordPress event stream"""
    parse_functions = {
        'posts': parse_post,
        'likes': parse_like,
        'comments': parse_comment}
    stream = tz.pipe(
        ## Connect
        start_wordpress_stream(CONFIG['stream_urls'][stream_key]),
        ## Parse
        tz.map(permissive_json_load),  # parse the JSON, or return an empty dictionary
        tz.map(parse_functions[stream_key]),  # parse into a flat dictionary
    )
    # Collect
    saveing_function(stream_key, stream)
def worker(pipeline, aaf_fname, result_q, contig_q):
    """Given a pipeline, run it with reads from the given AAF taken from contigs
    supplied over the contig_q.

    This expects the pipeline to yield one final result which it can then return.

    It expects the last element of pipeline to be a function that consumes an AAF
    iterator and returns a result.

    :param pipeline: A list of pipeline nodes
    :param aaf_fname: Source AAF file
    :param result_q: The result is put here.
    :param contig_q: messages are contig names. A None indicates stop_iter
    :return:
    """
    aaf = pysam.TabixFile(aaf_fname)
    t1 = aaf_iter(aaf, contig_q)
    sink = pipeline[-1]
    result_q.put(sink(cyt.pipe(t1, *pipeline[:-1])))