def test_get_answer():
    """Day 7 (2020): check both puzzle answers against the sample inputs."""
    bags = read_csv('test/2020/data/day7.csv',
                    col_names=['outer_bags', 'inner_bags'],
                    sep='contain', engine='python')
    first = get_answer1(bags)
    second = get_answer2(bags)
    bags_part2 = read_csv('test/2020/data/day7_2.csv',
                          col_names=['outer_bags', 'inner_bags'],
                          sep='contain', engine='python')
    second_alt = get_answer2(bags_part2)
    assert first == 4
    assert second == 32
    assert second_alt == 126
def test_get_answer():
    """Day 12 (2021): cave-graph sample yields 10 and 36 paths."""
    edges = read_csv('test/2021/data/day12.csv', col_names=['f', 't'], sep='-')
    assert get_answer1(edges) == 10
    assert get_answer2(edges) == 36
def test_get_answer():
    """Day 20 (2020): run both solvers on the sample (assertions disabled)."""
    tiles = read_csv('test/2020/data/day20.csv', col_names=['XX'], sep=',')
    answer1 = get_answer1(tiles)
    answer2 = get_answer2(tiles)
    # Expected sample results, currently not asserted:
    # assert answer1 == 20899048083289
    # assert answer2 == 336
def test_get_answer():
    """Day 5 (2021): vent-line sample gives 5 and 12 overlap points."""
    vents = read_csv('test/2021/data/day5.csv', col_names=['from', 'to'],
                     sep='->', engine='python')
    assert get_answer1(vents) == 5
    assert get_answer2(vents) == 12
def test_get_answer():
    """Day 9 (2020): XMAS sample with preamble length 5."""
    numbers = read_csv('test/2020/data/day9.csv', col_names=['xmas'], sep=',')
    assert get_answer1(numbers, 5) == 127
    assert get_answer2(numbers, 5) == 62
def mirnaid_fix(fname: str):
    """Rewrite the 'miRNA ID' column of a data file from the mirbase table.

    The organism prefix (mmu/hsa/...) is inferred from keywords in the file
    name; for each miRNA sequence only the newest mirbase version is kept for
    the lookup. The fixed file is written under MIRNA_SEQ_PATH.

    Raises:
        Exception: if the file name matches no known organism keyword.
    """
    organism_prefixes = {
        "mouse": "mmu",
        "human": "hsa",
        "elegans": "cel",
        "cattle": "bta",
        "fly": "aga",
    }
    prefix = None
    for keyword, mirbase_prefix in organism_prefixes.items():
        if keyword in fname:
            # No break: the last matching keyword wins (original behavior).
            prefix = mirbase_prefix
    if prefix is None:
        raise Exception("unrecognized mirbase prefix")

    mirbase_df: DataFrame = pd.read_csv(MIRBASE_FILE).query("prefix==@prefix")
    # Keep only the newest version of each sequence.
    mirbase_df.sort_values(by="version", ascending=False, inplace=True)
    mirbase_df.drop_duplicates("miRNA sequence", keep="first", inplace=True)

    fin_full_path = READ_PATH / fname
    fout_full_path = MIRNA_SEQ_PATH / fname
    # BUGFIX(readability): the original reused the name `d` for both the
    # prefix dict above and this DataFrame; use a distinct name.
    in_df: DataFrame = read_csv(fin_full_path)
    join_df = in_df.merge(mirbase_df, how="left",
                          left_on="miRNA sequence", right_on="miRNA sequence")
    in_df['miRNA ID'] = join_df['miRNA ID_y']
    to_csv(in_df, fout_full_path)
def test_get_answer():
    """Day 18 (2020): expression-evaluation sample totals."""
    exprs = read_csv('test/2020/data/day18.csv', col_names=['expr'], sep=',')
    assert get_answer1(exprs) == 26335
    assert get_answer2(exprs) == 693891
def test_get_answer():
    """Day 1 (2020): expense-report sample products."""
    expenses = read_csv('test/2020/data/day1.csv', col_names=['amount'])
    assert get_answer1(expenses) == 514579
    assert get_answer2(expenses) == 241861950
def main(args):
    """Build the action dataset from annotated thermal/depth recordings.

    Loads the sensor match file and the per-task annotation CSVs under
    args.data_root_dir, extracts the matching thermal frames for each task,
    and dispatches clip processing; background jobs run on a shared thread
    pool that is drained before returning.
    """
    global executor
    # Shared pool used by the processing helpers (hence the global).
    executor = concurrent.futures.ThreadPoolExecutor(16)

    # Make dataset directories
    for action in ACTIONS:
        os.makedirs(os.path.join(args.output_dir, action), exist_ok=True)

    # Load match file
    match_file = os.path.join(
        args.data_root_dir, args.date,
        'match_{}_{}.pkl'.format(args.thermal_sensor, args.depth_sensor))
    utils.check_exists(match_file)
    with open(match_file, 'rb') as f:
        tasks = pickle.load(f)

    thermal_video_dir = os.path.join(args.data_root_dir, args.date,
                                     'thermal', args.thermal_sensor, 'videos')
    annotation_dir = os.path.join(
        args.data_root_dir, args.date, 'results',
        '{}_{}'.format(args.thermal_sensor, args.depth_sensor))
    # Annotation files are named <task_id>.csv.
    task_ids = sorted([int(f.split('.')[0]) for f in os.listdir(annotation_dir)])

    # Read and process each task.
    videos = {}
    for task_id in task_ids:
        print("Task: {}".format(task_id))
        clips = utils.read_csv(
            os.path.join(annotation_dir, '{}.csv'.format(task_id)))
        task = tasks[task_id]
        thermal_time_str = task[0][0]
        # Raw thermal data
        frames = get_thermal_frames(videos, thermal_video_dir,
                                    thermal_time_str)
        process_annotations(args, task, clips, frames)

    print("Waiting for jobs to finish...")
    executor.shutdown(wait=True)
def test_get_answer():
    """Day 3 (2020): toboggan sample tree counts."""
    terrain = read_csv('test/2020/data/day3.csv', col_names=['pattern'], sep=',')
    assert get_answer1(terrain) == 7
    assert get_answer2(terrain) == 336
def human_mapping_merge_by_name(fin: Path, fout: Path):
    """Attach the human biomart region sequence to each interaction row.

    Rows are joined on (region, first two '_'-fields of the mRNA ID); the
    joined sequence is kept as 'region sequence' and a 'join_ok' flag marks
    rows whose extended mRNA sequence is actually contained in it.
    """

    def verify_sequence(seq: str, subseq: str) -> bool:
        # A NaN (float) sequence has no .find -> treat as "not contained".
        try:
            return seq.find(subseq) != -1
        except AttributeError:
            return False

    in_df: DataFrame = read_csv(fin)
    in_df["join_key"] = in_df["mRNA ID"].apply(
        lambda x: "|".join(x.split("_")[0:2]))
    mRNA_df = concatenate_biomart_df("human")
    in_df = in_df.merge(mRNA_df, how="left",
                        left_on=["region", "join_key"],
                        right_on=["region", "ID"])
    in_df = in_df.rename(columns={"sequence": "region sequence"})
    # BUGFIX: the selection list used 'region_sequence' (underscore), but the
    # column was just renamed to 'region sequence' (space) -> KeyError.
    # Use the real column name, matching the apply() below and the rest of
    # the pipeline.
    in_df = in_df[[
        'key', 'paper name', 'miRNA ID', 'miRNA sequence', 'mRNA ID',
        'mRNA_seq_extended', 'region', 'region sequence', 'mRNA_start',
        'mRNA_end_extended'
    ]]
    in_df["join_ok"] = in_df.apply(
        func=get_wrapper(verify_sequence,
                         'region sequence', 'mRNA_seq_extended'),
        axis=1)
    to_csv(in_df, fout)
def test_get_answer():
    """Day 1 (2021): depth-increase sample counts."""
    depths = read_csv('test/2021/data/day1.csv', col_names=['XX'], sep=',')
    assert get_answer1(depths) == 7
    assert get_answer2(depths) == 5
def get_site_from_extended_site(fin: str, fout: str):
    """Locate the extended mRNA sequence inside the region sequence and add
    chimera_start/chimera_end coordinates plus the extracted 'site' column."""

    def calc_chimera_start(seq: str, subseq: str) -> int:
        # 1-based start of subseq within seq; -1 when absent or seq is NaN.
        try:
            position = seq.find(subseq)
        except AttributeError:
            return -1
        return -1 if position == -1 else position + 1

    def calc_chimera_end(chimera_start: int, seq_extended: str) -> int:
        # End coordinate, trimming the fixed extension tail; -1 propagates.
        if chimera_start == -1:
            return -1
        return chimera_start + len(seq_extended) - 1 - HUMAN_SITE_EXTENDED_LEN

    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    df["chimera_start"] = df.apply(
        func=get_wrapper(calc_chimera_start,
                         'region sequence', 'mRNA_seq_extended'),
        axis=1)
    df["chimera_end"] = df.apply(
        func=get_wrapper(calc_chimera_end,
                         'chimera_start', 'mRNA_seq_extended'),
        axis=1)
    df["site"] = df.apply(
        func=get_wrapper(get_subsequence_by_coordinates,
                         "region sequence", "chimera_start", "chimera_end",
                         extra_chars=SITE_EXTRA_CHARS),
        axis=1)
    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def test_get_answer():
    """Day 25 (2020): sample encryption key (part 2's assert is disabled)."""
    keys = read_csv('test/2020/data/day25.csv', col_names=['pub_key'], sep=',')
    answer1 = get_answer1(keys)
    answer2 = get_answer2(keys)
    assert answer1 == 14897079
    # assert answer2 == 336
def test_get_answer():
    """Template test for a new day's puzzle (dayX placeholder data)."""
    df = read_csv('test/2021/data/dayX.csv', col_names=['XX'], sep=',')
    # BUGFIX: removed the unused `dl` list that was slurped via an unclosed
    # open() handle — dead code plus a file-handle leak.
    assert get_answer1(df) == 7
    assert get_answer2(df) == 336
def test_get_answer():
    """Day 2 (2021): submarine course sample."""
    course = read_csv('test/2021/data/day2.csv', col_names=['dir', 'step'],
                      sep=' ')
    assert get_answer1(course) == 150
    assert get_answer2(course) == 900
def test_get_answer():
    """Day 8 (2020): handheld-console sample program."""
    program = read_csv('test/2020/data/day8.csv',
                       col_names=['operation', 'argument'], sep=' ')
    assert get_answer1(program) == 5
    assert get_answer2(program) == 8
def test_get_answer():
    """Day 2 (2020): password-policy sample."""
    passwords = read_csv('test/2020/data/day2.csv',
                         col_names=['policy', 'password'], sep=':')
    assert get_answer1(passwords) == 2
    assert get_answer2(passwords) == 1
def test_get_answer():
    """Day 8 (2021): seven-segment display sample."""
    displays = read_csv('test/2021/data/day8.csv',
                        col_names=['all', 'shown'], sep='|')
    assert get_answer1(displays) == 26
    assert get_answer2(displays) == 61229
def test_get_answer():
    """Day 3 (2021): diagnostic-report sample (bit strings kept as str)."""
    report = read_csv('test/2021/data/day3.csv', col_names=['XX'], sep=',',
                      dtype=str)
    assert get_answer1(report) == 198
    assert get_answer2(report) == 230
def test_get_answer():
    """Day 6 (2020): customs-form sample; blank lines separate the groups."""
    responses = read_csv('test/2020/data/day6.csv', col_names=['response'],
                         sep=',', skip_blank_lines=False)
    assert get_answer1(responses) == 11
    assert get_answer2(responses) == 6
def splunk(fin: str, fout: str):
    """Filter out invalid rows, drop the SPLUNK_COL_TO_DROP columns, and
    write the remainder to fout (creating parent directories as needed)."""
    in_df: DataFrame = read_csv(Path(fin))
    in_df = in_df[in_df['valid_row']]
    # NOTE(review): astype(bool) maps any non-empty string (even "False")
    # to True — confirm the upstream dtype of duplex_valid.
    in_df = in_df.astype({'duplex_valid': bool})
    in_df = in_df[in_df['duplex_valid']]
    print(in_df['duplex_valid'].unique())  # debug output, kept for parity
    in_df.drop(columns=SPLUNK_COL_TO_DROP, inplace=True)
    Path(fout).parent.mkdir(parents=True, exist_ok=True)
    in_df.to_csv(fout)
def feature_extraction(fin: str, fout: str):
    """Compute features for the valid rows and left-join them back onto the
    full input table by index."""
    in_df: DataFrame = read_csv(Path(fin))
    # NOTE(review): duplex_valid is compared as the string 'True' here, while
    # splunk() casts it to bool — confirm which dtype the pipeline produces.
    valid_df = in_df.query("valid_row & duplex_valid=='True'")
    feature_df = df_feature_extractor(valid_df)
    result = pd.merge(left=in_df, right=feature_df,
                      left_index=True, right_index=True, how='left')
    to_csv(result, Path(fout))
def fast_blast_file(fin: Path, fout: Path, db_title: str):
    """Mark each site with a matching sequence from the <db_title> biomart
    file — substring containment instead of a full blastn run."""
    logger.info(f"fast blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)
    seq_file = BIOMART_DATA_PATH / f"{db_title}.csv"
    # Bind the reference table once; the wrapper pulls 'site' per row.
    lookup = partial(df_contains, df=pd.read_csv(seq_file))
    in_df["blast sequence"] = in_df.apply(func=get_wrapper(lookup, "site"),
                                          axis=1)
    to_csv(in_df, fout)
def rna_insertion(fin_full_path: Path, fout_full_path: Path, rna_df: DataFrame):
    """Left-join the rna sequence table onto the file by 'mRNA ID' and save."""
    logger.info(f"Insert rna sequence to {fin_full_path}")
    df: DataFrame = read_csv(fin_full_path)
    # validate="many_to_one": each mRNA ID must be unique in rna_df.
    merged = df.merge(rna_df, how="left", left_on="mRNA ID",
                      right_on="mRNA ID", validate="many_to_one")
    to_csv(merged, fout_full_path)
    logger.info(f"Finish the rna sequence insertion to {fin_full_path}")
def read_blast_result_file(fin: Path) -> DataFrame:
    """Load one per-region blast result CSV.

    The region name is taken from the last '_'-separated field of the file
    stem; rows without a sequence (failed blasts) are dropped.
    """
    logger.info(f"read_blast_result_file {fin}")
    df: DataFrame = read_csv(Path(fin))
    # errors="ignore": leave the original dtype in place when the coordinate
    # columns cannot be converted.
    df = df.astype({'s.start': 'Int32', 's.end': 'Int32'}, errors="ignore")
    df["region"] = fin.stem.split("_")[-1]
    # take only the rows with valid results
    df.dropna(axis=0, how='any', subset=['sequence'], inplace=True)
    return df
def test_get_answer():
    """Day 4 (2020): passport samples — count, all-valid and all-invalid."""
    batch = read_csv('test/2020/data/day4.csv', col_names=['passport'],
                     sep=',', skip_blank_lines=False)
    assert get_answer1(batch) == 2
    valid_batch = read_csv('test/2020/data/day4_valid.csv',
                           col_names=['passport'], sep=',',
                           skip_blank_lines=False)
    assert get_answer2(valid_batch) == 4
    invalid_batch = read_csv('test/2020/data/day4_invalid.csv',
                             col_names=['passport'], sep=',',
                             skip_blank_lines=False)
    assert get_answer2(invalid_batch) == 0
def insert_site_by_coordinates(fin: str, fout: str):
    """Cut the 'site' subsequence out of 'mRNA sequence' using the
    chimera_start/chimera_end coordinates (plus SITE_EXTRA_CHARS padding)."""
    logger.info(f"Insert site to {fin}")
    df: DataFrame = read_csv(Path(fin))
    extractor = get_wrapper(get_subsequence_by_coordinates,
                            "mRNA sequence", "chimera_start", "chimera_end",
                            extra_chars=SITE_EXTRA_CHARS)
    df["site"] = df.apply(func=extractor, axis=1)
    to_csv(df, Path(fout))
    logger.info(f"finish the site sequence insertion to {fin}")
def blast_file(fin: Path, fout: Path, db_title: str):
    """Run blastn for every site against <db_title> and append the result
    columns to the input table."""
    logger.info(f"blast file {fin} against {db_title}")
    in_df: DataFrame = read_csv(fin)
    blastn_df: DataFrame = in_df.apply(
        func=get_wrapper(run_blastn, "site", db_title=db_title), axis=1)
    result = pd.concat([in_df, blastn_df], axis=1)
    to_csv(result, fout)
def duplex(method: str, fin: str, fout: str):
    """Fold each miRNA/site pair with the requested duplex method and
    left-join the duplex columns back onto the input rows."""
    duplex_cls: Duplex = DUPLEX_DICT[method]
    logger.info(f"{method} do_duplex to {fin}")
    in_df: DataFrame = read_csv(Path(fin))
    folder = get_wrapper(do_duplex, "miRNA sequence", "site", cls=duplex_cls)
    # Only rows flagged valid_row are folded; the rest get NaN duplex columns
    # from the left join below.
    duplex_df = in_df.query("valid_row").apply(func=folder, axis=1)
    result = pd.merge(left=in_df, right=duplex_df,
                      left_index=True, right_index=True, how='left')
    result["duplex_method"] = method
    to_csv(result, Path(fout))