def listfiles(path, ext_list=None):
    "returns a sorted list of file basenames in the given dir, optionally filtered by extension"
    path_list = lmap(lambda fname: os.path.abspath(join(path, fname)), os.listdir(path))
    if ext_list:
        path_list = lfilter(lambda path: os.path.splitext(path)[1] in ext_list, path_list)
    path_list = sorted(filter(os.path.isfile, path_list))
    return lmap(os.path.basename, path_list)
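# The snippets in this section lean on small list-returning wrappers around map/filter.
# A minimal sketch of those helpers, assumed rather than copied from the original code:
def lmap(func, *iterables):
    "like map(), but returns a list instead of a lazy iterator (Python 3)"
    return list(map(func, *iterables))

def lfilter(func, iterable):
    "like filter(), but returns a list instead of a lazy iterator (Python 3)"
    return list(filter(func, iterable))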
def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
    pred_str = lmap(str.strip, pred_str)
    label_str = lmap(str.strip, label_str)
    return pred_str, label_str
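# Hedged usage sketch: decode_pred reads `tokenizer` from its enclosing scope and is
# typically wired into a Hugging Face Trainer as the compute_metrics callback (which
# receives an EvalPrediction). calculate_rouge stands in for whatever metric helper
# the surrounding project provides.
def compute_metrics(pred: EvalPrediction) -> dict:
    pred_str, label_str = decode_pred(pred)
    return calculate_rouge(pred_str, label_str)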
def read_data(self, data_path):
    with open(data_path, "rt") as file_:
        func = lambda line: [1] + lmap(float, line.split())
        data = lmap(func, file_.readlines())
    return data
def main():
    m = {'#': 1, '.': 0}
    l = lmap(lambda x: [m[i] for i in x[:-1]], read_lines())
    ast = []
    for i in range(len(l)):
        for j in range(len(l[0])):
            if l[i][j]:
                ast.append((j, i))
    s = []
    for i in ast:
        angles = set()
        for j in ast:
            if i != j:
                k = compute_k(i, j)
                angles.add(k)
        s.append(len(angles))
    i = np.argmax(s)
    print(s[i])
    best = ast[i]
    dd = collections.defaultdict(list)
    for i in ast:
        if i != best:
            k = compute_k(i, best)
            dd[k].append(i)
    d = dict(dd)
    s = sorted([j for i in d.values() for j in compute_angles_group(i, best)])
    print(s[199][1])
def expand_videos(msid, video):
    gc_data = metadata(msid)  # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())
    v_id = video['id']

    ensure(gc_data, "glencoe doesn't know %r, it doesn't have any media" % msid)
    ensure(v_id in gc_data, "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: OrderedDict([
        ('mediaType', SOURCES[mtype]),
        ('uri', gc_data[v_id][mtype + "_href"])
    ])
    video_data['sources'] = lmap(func, SOURCES)

    video.update(video_data)
    del video['uri']  # returned by elife-tools, not part of spec

    # Add placeholder, the video thumbnail image
    video["placeholder"] = OrderedDict()
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
def main():
    line = input()
    intcode = lmap(int, line.split(','))
    rv = run_intcode(intcode, 1)
    print('1.)', rv)
    rv = run_intcode(intcode, 5)
    print('2.)', rv)
def expand_videos(msid, video):
    gc_data = metadata(msid)  # cached on first hit
    gc_id_str = ", ".join(gc_data.keys())
    v_id = video['id']

    ensure(v_id in gc_data, "glencoe doesn't know %r, only %r" % (v_id, gc_id_str))

    video_data = gc_data[v_id]
    video_data = utils.subdict(video_data, ['jpg_href', 'width', 'height'])
    video_data = utils.renkeys(video_data, [('jpg_href', 'image')])

    func = lambda mtype: {
        'mediaType': SOURCES[mtype],
        'uri': gc_data[v_id][mtype + "_href"]
    }
    video_data['sources'] = lmap(func, SOURCES)

    video.update(video_data)
    del video['uri']  # returned by elife-tools, not part of spec

    # Add placeholder, the video thumbnail image
    video["placeholder"] = {}
    video["placeholder"]["uri"] = video["image"].split('/')[-1]
    video["placeholder"]["alt"] = ""

    return video
def _generative_step(self, batch: dict) -> dict:
    t0 = time.time()
    # generated_ids = self.model.generate(
    #     batch["input_ids"],
    #     attention_mask=batch["attention_mask"],
    #     use_cache=True,
    #     decoder_start_token_id=self.decoder_start_token_id,
    # )
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        encoder_answer_relevance_atten=batch['answer_relevance_atten'],
        use_cache=True,
        decoder_start_token_id=self.decoder_start_token_id,
    )
    gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["labels"])
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
def _generative_step(self, batch: dict) -> dict:
    start_time = time.time()
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        do_deduplication=False,  # rag specific parameter
        use_cache=True,
        min_length=1,
        max_length=self.target_lens["val"],
    )
    gen_time = (time.time() - start_time) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["decoder_input_ids"])
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    gen_metrics: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **gen_metrics)
    return base_metrics
def _generative_step(self, batch: dict) -> dict:
    pad_token_id = self.tokenizer.pad_token_id
    source_ids, source_mask, y = SummarizationDataset.trim_seq2seq_batch(batch, pad_token_id)
    t0 = time.time()
    generated_ids = self.model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        use_cache=True,
        decoder_start_token_id=self.decoder_start_token_id,
    )
    gen_time = (time.time() - t0) / source_ids.shape[0]
    preds = self.ids_to_clean_text(generated_ids)
    target = self.ids_to_clean_text(y)
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, summ_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
def _generative_step(self, batch: dict) -> dict:
    t0 = time.time()
    # parser.add_argument('--eval_max_gen_length', type=int, default=None, help='never generate more than n tokens')
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        use_cache=True,
        decoder_start_token_id=self.decoder_start_token_id,
        num_beams=self.eval_beams,
        max_length=self.eval_max_length,
    )
    gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["labels"])
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
def _generative_step(self, batch: dict) -> dict:
    t0 = time.time()
    # TODO(LISA)
    # write the prompt generation from self.model.
    # parser.add_argument('--eval_max_gen_length', type=int, default=None, help='never generate more than n tokens')
    # get the prompt:
    bsz = batch["input_ids"].size(0)
    prefix_prompt = self.model.get_prompt(bsz=bsz, sample_size=self.eval_beams)
    # print(prefix_prompt)
    generated_ids = self.seq2seq_model.generate(
        batch["input_ids"],
        past_key_values=prefix_prompt,
        attention_mask=batch["attention_mask"],
        use_cache=True,
        length_penalty=self.hparams.length_penalty,
        use_prefix=True,
        decoder_start_token_id=self.decoder_start_token_id,
        num_beams=self.eval_beams,
        min_length=self.eval_min_length,
        max_length=self.eval_max_length,
    )
    gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["labels"])
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    # print('INPUT:', self.ids_to_clean_text(batch["input_ids"]))
    # print(preds, target)
    rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
def test_distill_checkpointing_with_teacher(self):
    updates = dict(
        student_encoder_layers=2,
        student_decoder_layers=1,
        max_epochs=4,
        val_check_interval=0.25,
        alpha_hid=2.0,
        model_name_or_path="IGNORE_THIS_IT_DOESNT_GET_USED",
    )
    model = self._test_distiller_cli(updates, check_contents=False)

    ckpts = list(Path(model.output_dir).glob("*.ckpt"))
    self.assertEqual(1, len(ckpts))
    transformer_ckpts = list(Path(model.output_dir).glob("**/*.bin"))
    self.assertEqual(len(transformer_ckpts), 2)
    examples = lmap(str.strip, model.hparams.data_dir.joinpath("test.source").open().readlines())
    out_path = tempfile.mktemp()
    generate_summaries_or_translations(examples, out_path, str(model.output_dir / "best_tfmr"))
    self.assertTrue(Path(out_path).exists())

    out_path_new = tempfile.mkdtemp()
    convert_pl_to_hf(ckpts[0], transformer_ckpts[0].parent, out_path_new)
    assert os.path.exists(os.path.join(out_path_new, "pytorch_model.bin"))
def main(xml_dir, json_output_dir):
    paths = lmap(lambda fname: join(xml_dir, fname), os.listdir(xml_dir))
    paths = lfilter(lambda path: path.lower().endswith('.xml'), paths)
    paths = sorted(paths, reverse=True)
    num_processes = 2
    Parallel(n_jobs=num_processes)(delayed(render)(path, json_output_dir) for path in paths)
    print('see scrape.log for errors')
def main():
    line = input()
    intcode = lmap(int, line.split(','))
    intcode.extend([0] * 1000)
    O, cost = task1(intcode.copy())
    print("1.)", cost)
    cnt = task2(intcode, O)
    print("2.)", cnt)
def main():
    line = input()
    intcode = lmap(int, line.split(','))
    intcode.extend([0] * 10000)
    r = task1(intcode)
    print("1.)", r)
    r = task2(intcode)
    print("2.)", r)
def main(xml_dir, json_output_dir, num=None):
    paths = lmap(lambda fname: join(xml_dir, fname), os.listdir(xml_dir))
    paths = lfilter(lambda path: path.lower().endswith('.xml'), paths)
    paths = sorted(paths, reverse=True)
    if num:
        paths = paths[:num]  # only scrape first n articles
    num_processes = -1
    Parallel(n_jobs=num_processes)(delayed(render)(path, json_output_dir) for path in paths)
    print('see scrape.log for errors')
def read_labels(self, labels_path):
    with open(labels_path, "rt") as file_:
        func = lambda line: lmap(int, reversed(line.split()))
        labels = dict(map(func, file_.readlines()))
    return labels
def serialize_overrides(override_map):
    def serialize(pair):
        key, val = pair
        ensure(isinstance(key, basestring), "key must be a string")
        ensure('|' not in key, "key must not contain a pipe")
        key = key.strip()
        ensure(key, "key must not be empty")
        return '|'.join([key, json.dumps(val)])
    return lmap(serialize, override_map.items())
def pdf_uri(triple):
    """predict an article's pdf url.
    some article types don't have a PDF (like corrections) and some older articles
    that should have a pdf, don't. this function doesn't concern itself with those
    latter exceptions."""
    content_type, msid, version = triple
    if content_type and any(lmap(lambda type: type in ['Correction', 'Retraction'], content_type)):
        return EXCLUDE_ME
    filename = "elife-%s-v%s.pdf" % (utils.pad_msid(msid), version)  # ll: elife-09560-v1.pdf
    return cdnlink(msid, filename)
def figures_pdf_uri(triple):
    graphics, msid, version = triple
    filename_match = '-figsupp'

    if any(lmap(lambda graphic: graphic.get('xlink_href') and filename_match in graphic.get('xlink_href'), graphics)):
        filename = "elife-%s-figures-v%s.pdf" % (utils.pad_msid(msid), version)  # ll: elife-09560-figures-v1.pdf
        figures_pdf_cdnlink = cdnlink(msid, filename)
        return cdn.url_exists(figures_pdf_cdnlink, msid)
    else:
        return None
def deserialize_overrides(override_list):
    def splitter(string):
        if isinstance(string, list):
            pair = string  # already split into pairs, return what we have
            return pair
        ensure('|' in string, "override key and value must be separated by a pipe '|'")
        first, rest = string.split('|', 1)
        ensure(rest.strip(), "a value must be provided. use 'null' without quotes to use an empty value")
        return first, rest
    pairs = lmap(splitter, override_list)
    return OrderedDict([(key, utils.json_loads(val)) for key, val in pairs])
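# A hedged round-trip sketch for serialize_overrides/deserialize_overrides above. It
# assumes ensure() raises when its first argument is falsy, basestring is aliased to
# str on Python 3, and utils.json_loads behaves like json.loads; the values are
# illustrative only.
overrides = OrderedDict([("title", "A new title"), ("volume", 7)])
serialized = serialize_overrides(overrides)   # ['title|"A new title"', 'volume|7']
assert deserialize_overrides(serialized) == overrides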
def get_spectrograms(self, path, folder_name, pattern='', ignore_names=None):
    """
    The base_loader allows the most flexibility in structuring your files, though it is
    probably overly complicated at the moment. Other, simpler loaders inherit from the
    base loader and override this method.

    :param path: Base path containing folders that can contain events and noise.
        Example:
            /path/California
            /path/Oklahoma
            /path/Hawaii
    :param folder_name: Could be "noise" or "quakes" if following the recommended file
        organization. Will return paths within this folder.
        Example:
            /path/California/noise
            /path/California/quakes
            ...
    :param pattern: Pattern used to access spectrogram folders within folder_name. The
        default setting assumes the files are set up as something like:
            /path/California/quakes/1/first_component.png
            /path/California/quakes/1/second_component.png
            /path/California/quakes/1/vertical_component.png
            ...
        where the names of the components can be anything; there should be 3 components
        per folder. Custom patterns are passed as a regex string and allow specifying
        the location of the spectrograms within the subfolders of the path:
            /path/California/pattern/quakes/1/first_component.png
    :param ignore_names: Folder names under path to skip entirely.
    :return: Sorted list of component file paths.
    """
    ignore_names = ignore_names or []  # guard against the default None
    folders = lmap(os.path.basename, glob.glob(os.path.join(path, '*')))
    folders = [f for f in folders if f not in ignore_names]

    def get_file_paths(folder_path):
        folders_path = os.path.join(path, folder_path, pattern, folder_name, '*/')
        subfolder_paths = glob.glob(folders_path)
        return self.get_components(subfolder_paths)

    file_paths = []
    for folder in folders:
        file_paths += get_file_paths(folder)

    # Maintain the same order each time, guaranteed with sorting
    file_paths.sort()
    return file_paths
def main():
    line = input()
    intcode = lmap(int, line.split(','))
    intcode.extend([0] * 100000)
    x = IntCodeProgram(intcode)
    code = x.run_intcode(iter([1]))
    print(code)
    x = IntCodeProgram(intcode)
    code = x.run_intcode(iter([2]))
    print(code)
def main():
    line = input()
    intcode = lmap(int, line.split(','))
    r = run_intcode(intcode)
    print('1.)', r)
    for n, v in itertools.product(range(100), range(100)):
        r = run_intcode(intcode, n, v)
        if r == 19690720:
            print('2.)', n * 100 + v)
            break
def main():
    intcode = lmap(int, input().split(','))
    intcode.extend([0] * 1000)
    computers = []
    for ip in range(50):
        pc = IntCodeProgram(intcode)
        pc.add_input(ip)
        computers.append(pc)
    print("1.)", task1(deepcopy(computers)))
    print("2.)", task2(computers))
def pdf_uri(triple):
    """predict an article's pdf url.
    some article types don't have a PDF (like corrections) and some older articles
    that should have a pdf, don't. this function doesn't concern itself with those
    latter exceptions."""
    content_type, msid, version = triple
    if content_type and any(lmap(lambda type: type in ['Correction'], content_type)):
        return EXCLUDE_ME
    filename = "elife-%s-v%s.pdf" % (utils.pad_msid(msid), version)  # ll: elife-09560-v1.pdf
    return cdnlink(msid, filename)
def main(args=None):
    target = first(args) or conf.JSON_DIR
    if os.path.isdir(target):
        paths = lmap(lambda fname: join(target, fname), os.listdir(target))
        paths = sorted(paths, reverse=True)
    else:
        paths = [os.path.abspath(target)]
    paths = lfilter(lambda path: path.lower().endswith('.json'), paths)
    print('jobs %d' % len(paths))
    Parallel(n_jobs=-1)(delayed(job)(path) for path in paths)
    print('see validate.log for errors')
def task2(intcode):
    intcode[0] = 2
    prog = IntCodeProgram(intcode)
    out = prog.run()
    out = ''.join(lmap(chr, out))
    robot_pos = out.find('^')
    out = lmap(ord, out[:out.find('\n\n') + 1])
    i = out.index(ord('\n'))
    a = np.array(out).reshape(-1, i + 1)
    robot_pos = robot_pos // a.shape[1] + 1, robot_pos % a.shape[1] + 1
    moves = find_moves(np.pad(a, 1, constant_values=ord('.')), robot_pos)
    # solution found by hand using vim
    m = ('C,A,C,B,C,A,B,C,A,B\n'
         'R,6,L,8,R,10\n'
         'L,8,R,4,R,4,R,6\n'
         'R,12,R,4,R,10,R,12\n'
         'n\n')
    for i in m:
        prog.add_input(ord(i))
    out = prog.run()
    return out[-1]
def _generative_step(self, batch: dict) -> dict:
    t0 = time.time()
    generated_ids = self.model.generate(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],
        use_cache=True,
        decoder_start_token_id=self.decoder_start_token_id,
        num_beams=self.eval_beams,
        no_repeat_ngram_size=0,
        min_length=0,
        max_length=self.eval_max_length,
        length_penalty=1.0,
    )
    gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
    preds: List[str] = self.ids_to_clean_text(generated_ids)
    target: List[str] = self.ids_to_clean_text(batch["labels"])

    y = batch["labels"]
    decoder_input_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone()

    a = self.tokenizer.batch_decode(batch["input_ids"].tolist())
    # b = self.tokenizer.batch_decode(batch["labels"].tolist())
    b = self.tokenizer.batch_decode(lm_labels.tolist())
    c = self.tokenizer.batch_decode(generated_ids)
    pad_token_id = self.tokenizer.pad_token_id
    tgt_ids = batch["labels"]
    # if isinstance(self.model, T5ForConditionalGeneration):
    #     decoder_input_ids = self.model._shift_right(tgt_ids)
    # else:
    #     decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)
    e = self.tokenizer.batch_decode(decoder_input_ids.tolist())

    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = self.calc_generative_metrics(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, a=a, b=b, c=c, e=e, **rouge)
    return base_metrics
def figures_pdf_uri(triple):
    graphics, msid, version = triple
    filename_match = '-figsupp'

    if any(lmap(lambda graphic: graphic.get('xlink_href') and filename_match in graphic.get('xlink_href'), graphics)):
        filename = "elife-%s-figures-v%s.pdf" % (utils.pad_msid(msid), version)  # ll: elife-09560-figures-v1.pdf
        figures_pdf_cdnlink = cdnlink(msid, filename)
        return cdn.url_exists(figures_pdf_cdnlink, msid)
    else:
        return None
def mixed_citation_to_related_articles(mixed_citation_list):
    # ll: [{'article': {'authorLine': 'R Straussman et al',
    #                   'authors': [{'given': u'R', 'surname': u'Straussman'}, ...],
    #                   'doi': u'10.1038/nature11183', 'pub-date': [2014, 2, 28], 'title': u'Pants-Party'},
    #       'journal': {'volume': u'487', 'lpage': u'504', 'name': u'Nature', 'fpage': u'500'}}]
    def et(struct):
        return OrderedDict([
            ('type', 'external-article'),
            ('articleTitle', p(struct, 'article.title')),
            ('journal', p(struct, 'journal.name')),
            ('authorLine', p(struct, 'article.authorLine')),
            ('uri', 'https://doi.org/%s' % p(struct, 'article.doi')),
        ])
    return lmap(et, mixed_citation_list)
def _generative_step(self, batch):
    pad_token_id = self.tokenizer.pad_token_id
    source_ids, source_mask, y = SummarizationDataset.trim_seq2seq_batch(batch, pad_token_id)
    # TODO(SS): task specific params
    t0 = time.time()
    generated_ids = self.model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        use_cache=True,
    )
    gen_time = time.time() - t0
    preds = self.ids_to_clean_text(generated_ids)
    target = self.ids_to_clean_text(y)
    loss_tensors = self._step(batch)
    base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
    rouge: Dict = calculate_rouge(preds, target)
    summ_len = np.mean(lmap(len, generated_ids))
    base_metrics.update(gen_time=gen_time, summ_len=summ_len, preds=preds, target=target, **rouge)
    return base_metrics
def main():
    img = lmap(int, input())
    img = np.array(img)
    img.shape = (-1, 6, 25)
    nz = np.sum(img == 0, axis=1).sum(axis=1)
    a = img[np.argmin(nz)]
    print('1.)', np.equal(a, 1).sum() * np.equal(a, 2).sum())

    a = np.ones(img.shape[1:]) * 2
    mask = np.zeros(img.shape[1:]).astype(bool)
    for i in img:
        a[mask] = i[mask]
        mask = a == 2
    plt.imshow(a)
    plt.show()
def deserialize_overrides(override_list):
    def splitter(string):
        if isinstance(string, list):
            pair = string  # already split into pairs, return what we have
            return pair
        ensure('|' in string, "override key and value must be separated by a pipe '|'")
        first, rest = string.split('|', 1)
        ensure(rest.strip(), "a value must be provided. use 'null' without quotes to use an empty value")
        return first, rest
    pairs = lmap(splitter, override_list)
    return {key: json.loads(val) for key, val in pairs}
def mixed_citation_to_related_articles(mixed_citation_list):
    # ll: [{'article': {'authorLine': 'R Straussman et al',
    #                   'authors': [{'given': u'R', 'surname': u'Straussman'}, ...],
    #                   'doi': u'10.1038/nature11183', 'pub-date': [2014, 2, 28], 'title': u'Pants-Party'},
    #       'journal': {'volume': u'487', 'lpage': u'504', 'name': u'Nature', 'fpage': u'500'}}]
    def et(struct):
        return {
            'type': 'external-article',
            'articleTitle': p(struct, 'article.title'),
            'journal': p(struct, 'journal.name'),
            'authorLine': p(struct, 'article.authorLine'),
            'uri': 'https://doi.org/%s' % p(struct, 'article.doi'),
        }
    return lmap(et, mixed_citation_list)
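# Hedged illustration only: assuming p() is a dotted-path getter into the nested dict,
# the example citation in the comment above would map to roughly this entry:
expected_entry = {
    'type': 'external-article',
    'articleTitle': 'Pants-Party',
    'journal': 'Nature',
    'authorLine': 'R Straussman et al',
    'uri': 'https://doi.org/10.1038/nature11183',
}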
def main(args):
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('paths', nargs="*")
    args = parser.parse_args(args)

    # read any filenames that were passed in as arguments
    paths = args.paths

    # failing that, try reading from stdin
    if not paths:
        paths = read_from_stdin()

    try:
        paths = lmap(json.loads, paths)
    except ValueError:
        # assume filenames.
        pass

    return do_paths(paths, dry_run=args.dry_run)