from jug import TaskGenerator, bvalue
import gzip
import os

# fasta_iter() and splitseq() are helper functions defined elsewhere in the
# pipeline (a sketch of the interface fasta_iter() is assumed to provide
# follows this listing); splitseq() splits the input FASTA into chunks so
# they can be deduplicated in parallel.

@TaskGenerator
def dedup_fasta(infile):
    print("start dedup")
    # Map each unique sequence to [first header seen, copy number]
    fasta = {}
    for ID, seq in fasta_iter(infile):
        if seq in fasta:
            fasta[seq][1] += 1
        else:
            fasta[seq] = [ID, 1]
    outfile1 = infile.replace('.faa.gz', '.raw_number.tsv.gz')
    outfile2 = infile.replace('.faa.gz', '.dedup.faa.gz')
    out1 = gzip.open(outfile1, "wt", compresslevel=1)
    out2 = gzip.open(outfile2, "wt", compresslevel=1)
    print("start sort")
    for seq, (ID, count) in sorted(fasta.items()):
        out1.write(f"{count}\t{seq}\n")
        out2.write(f">{ID}\n{seq}\n")
    out1.close()
    out2.close()
    os.unlink(infile)
    print("finish dedup and sort")
    return (outfile1, outfile2)


INPUT_FILE = "data/GMSC10.metag_smorfs.faa.gz"

splits = splitseq(INPUT_FILE)
for sp in bvalue(splits):
    dedup_fasta(sp)
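The dedup loop above assumes fasta_iter() yields (header, sequence) pairs. A minimal sketch of such a reader, assuming a well-formed gzip-compressed FASTA input (the pipeline's real helper may parse headers differently):

def fasta_iter(path):
    # Hypothetical sketch: yield (header, sequence) pairs from a gzipped FASTA
    import gzip
    header = None
    chunks = []
    with gzip.open(path, 'rt') as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                if header is not None:
                    yield header, ''.join(chunks)
                header = line[1:].split()[0]
                chunks = []
            else:
                chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)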
from os import path

import pandas as pd
from jug import bvalue


def create_ena_file_map(studies_tables, vol_map, MIRROR_BASEDIR):
    def annotate_link(p):
        def drop_hostname(addr):
            # Strip leading path components until the ENA mirror root ("vol1/...")
            while not addr.startswith("vol1"):
                newaddr = addr.split('/', 1)
                if len(newaddr) == 1:
                    raise ValueError("Couldn't find mirror root")
                addr = newaddr[-1]
            return addr

        if pd.isnull(p):
            return (p, p)
        p = drop_hostname(p)
        if p.endswith('_1.fastq.gz'):
            return ("fastq_1", p)
        if p.endswith('_2.fastq.gz'):
            return ("fastq_2", p)
        if p.endswith('.fastq.gz'):
            return ("fastq_single", p)
        raise ValueError("Cannot annotate {}".format(p))

    def localize(record, col):
        # Return the local mirror path for `col`, or '' if that file type is
        # absent for this run (missing column or null value)
        value = getattr(record, col, None)
        if value is None or pd.isnull(value):
            return ''
        return path.join(MIRROR_BASEDIR, value)

    with open(path.join(MIRROR_BASEDIR, vol_map), 'w') as out:
        out.write(
            "#study_accession\trun_accession\tsample_accession\texperiment_accession\tfastq_1\tfastq_2\tfastq_single\n"
        )
        for study in studies_tables:
            data = bvalue(studies_tables[study])  # bvalue() because this is a Tasklet
            if data is None:
                print("No file information for", study,
                      "(incomplete or older jug internal state?); skipping")
                continue
            annotations = data["ftp"].map(annotate_link)
            data["filetype"], data["filepath"] = zip(*annotations)
            subdata = data[[
                "study_accession", "run_accession", "sample_accession",
                "experiment_accession"
            ]].drop_duplicates()
            files = data.pivot(values="filepath", columns="filetype")
            grouped = pd.merge(subdata, files, left_index=True, right_index=True)
            for _, record in grouped.iterrows():
                # 7 columns: study_accession, run_accession, sample_accession,
                # experiment_accession, fastq_1, fastq_2, fastq_single
                fastq_1 = localize(record, 'fastq_1')
                fastq_2 = localize(record, 'fastq_2')
                fastq_single = localize(record, 'fastq_single')
                out.write(
                    f"{record.study_accession}\t{record.run_accession}\t{record.sample_accession}\t{record.experiment_accession}\t{fastq_1}\t{fastq_2}\t{fastq_single}\n"
                )
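For illustration, given an ENA-style FTP link (hypothetical run accession), annotate_link() strips the hostname down to the vol1/ mirror root and classifies the file by its suffix:

# annotate_link("ftp.sra.ebi.ac.uk/vol1/fastq/ERR000/ERR000001/ERR000001_1.fastq.gz")
#   == ("fastq_1", "vol1/fastq/ERR000/ERR000001/ERR000001_1.fastq.gz")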
        max_lengthscale=1000.0, max_variance=1000.0))
    experiment_storage_path, _, common_run_settings, dataset_custom_settings = \
        get_settings(dataset_name)
    baseline_exps[dataset_name] = FullbatchUciExperiment(**{
        **common_run_settings,
        **dataset_custom_settings,
        **baseline_custom_settings,
    })
    (
        all_model_parameters[dataset_name],
        full_rmses[dataset_name],
        full_nlpps[dataset_name],
        baseline_lmls[dataset_name],
    ) = jug.bvalue(run_baseline(baseline_exps[dataset_name]))


@jug.TaskGenerator
def run_sparse_init(exp):
    print(exp)
    exp.setup_model()
    exp.init_params()
    print_post_run(exp)
    elbo, upper, rmse, nlpp = compute_model_stats(exp)
    return elbo, upper, rmse, nlpp


# Sparse experiments
init_Z_runs = {}
init_Z_task_results = {}
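jug.bvalue() waits until the task's result is available and returns it as a plain Python value, which is why the four-way unpacking above works at script level. A minimal sketch with a toy function (not part of the experiment code):

import jug

@jug.TaskGenerator
def stats(x):
    # Toy stand-in for run_baseline(): returns a tuple of results
    return x, x ** 2

value, square = jug.bvalue(stats(4.0))  # blocks until the task has run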
from jug import barrier, Task, bvalue
import math


def double(x):
    return 2 * x


two = Task(double, 1)
two = bvalue(two)
four = 2 * two
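bvalue(two) behaves like a barrier followed by loading the task's result: the script does not proceed past that point until the task has been executed, after which two is a plain int rather than a Task. An equivalent spelling using barrier() and value(), both part of jug's public API:

from jug import Task, barrier, value

def double(x):
    return 2 * x

two = Task(double, 1)
barrier()              # stop here until every task defined so far can be loaded
four = 2 * value(two)  # value() loads the computed result from the store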
    topology=cfg['topology'],
    sasa_sidechain_h5=sc_sasa_h5,
    cluster_distance=cfg['cluster_distance_metric'],
    cluster_radii=cfg['cluster_radii'],
    kmedoids_updates=10)

selected_cluster_results = {}
for protein, cluster_result_list in cluster_results.items():
    if "lag_time" not in CONFIGS[protein]:
        continue
    for radius, cluster_result in zip(cfg['cluster_radii'], cluster_result_list):
        if CONFIGS[protein]["model_cluster_radius"] == radius:
            selected_cluster_results[protein] = cluster_result
            lag_time = CONFIGS[protein]["lag_time"]
            if cluster_result.assignments.can_load():
                dirname, assigs_file = os.path.split(
                    jug.bvalue(cluster_result.assignments))
                dirname = os.path.join(os.path.split(dirname)[0], 'models')
                msm_filename = assigs_file.replace(
                    '-assignments.h5',
                    '-%02dprior-%slt-msm' % (prior_count, lag_time))
                assignments = cluster_result.assignments
                msm2file(os.path.join(dirname, msm_filename),
                         assignments,
                         lag_time=lag_time)
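can_load() checks whether a task's result already exists in the jug store, without triggering any computation, so the MSM is only built once the assignments are on disk. A toy sketch of the pattern:

from jug import Task

def slow_square(x):
    return x * x

t = Task(slow_square, 3)
if t.can_load():      # True only after a `jug execute` worker has run the task
    print(t.load())   # retrieve the stored result without recomputing it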
@TaskGenerator
def run_deeparg_hamronize(deeparg_input, deeparg_output):
    # deeparg_software_version and deeparg_db_version are module-level constants
    oname = deeparg_output + '.hamronized'
    with open(oname, 'wb') as out:
        subprocess.check_call([
            'conda', 'run', '-n', 'hamronization',
            'hamronize', 'deeparg',
            '--input_file_name', deeparg_input,
            '--analysis_software_version', deeparg_software_version,
            '--reference_database_version', deeparg_db_version,
            deeparg_output + '.mapping.ARG'],
            stdout=out)
    return oname


splits_faa = split_seq_file('data/GMGC10.wastewater.95nr.test_10k.faa.gz')

partials = []
for faa in bvalue(splits_faa):
    partials.append(
        run_rgi_hamronize(faa, run_rgi(faa)))
concat_partials(partials, 'outputs/rgi.full.tsv.gz')

splits_fna = split_seq_file('data/GMGC10.wastewater.95nr.test_10k.fna.gz')
for fa in bvalue(splits_fna):
    for db in ['resfinder', 'card', 'argannot', 'ncbi', 'megares']:
        run_abricate_hamronize(run_abricate(fa, db))
    run_deeparg_hamronize(fa, run_deeparg(fa))


@TaskGenerator
def run_hamronize_summarize(reports, combined):
    '''Combine outputs of all the tools
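concat_partials() is defined elsewhere in the jugfile; a minimal sketch of the behavior its call site suggests (merge the per-chunk hAMRonized tables into one gzipped report, keeping a single copy of the column header; the name comes from the source, the body here is an assumption):

from jug import TaskGenerator

@TaskGenerator
def concat_partials(partials, oname):
    # Assumed behavior: concatenate partial TSV reports into one gzipped file
    import gzip
    with gzip.open(oname, 'wt') as out:
        for i, p in enumerate(partials):
            with open(p) as chunk:
                header = chunk.readline()
                if i == 0:
                    out.write(header)  # write the column header only once
                for line in chunk:
                    out.write(line)
    return oname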
def cluster(
        tag, trajectories, topology, sasa_sidechain_h5, cluster_radii,
        cluster_algorithm='khybrid', cluster_distance='euclidean',
        kmedoids_updates=5):
    import os
    from collections import namedtuple

    sc_sasa_filename = jug.bvalue(sasa_sidechain_h5)

    def make_cluster_name(sc_sasa_filename, radius):
        # Build the output stem for one clustering run: move from the features
        # directory to the cluster directory and encode the clustering
        # parameters in the filename
        return sc_sasa_filename \
            .replace('sasas-sidechains.h5', '') \
            .replace('/features/', '/cluster/') + \
            "-".join(['sidechains', cluster_algorithm, radius, cluster_distance] + (
                ['kmedoids' + str(kmedoids_updates)]
                if cluster_algorithm == 'khybrid' else []))

    ClusterFiles = namedtuple(
        'ClusterFiles',
        ['assignments', 'distances', 'center_indices',
         'structure_centers', 'feature_centers'])

    cluster_results = []
    for radius in cluster_radii:
        CLUSTER_STEM = make_cluster_name(sc_sasa_filename, radius)

        ASSIGNMENTS_FILE = CLUSTER_STEM + '-assignments.h5'
        DISTANCES_FILE = CLUSTER_STEM + '-distances.h5'
        FEATURE_CENTERS_FILE = CLUSTER_STEM + '-feature-centers.npy'
        CENTER_INDS_FILE = CLUSTER_STEM + '-center-inds.npy'

        assignments, distances, center_features, center_indices = \
            jug.iteratetask(tasks.cluster_features(
                sasa_sidechain_h5,
                ASSIGNMENTS_FILE,
                DISTANCES_FILE,
                FEATURE_CENTERS_FILE,
                CENTER_INDS_FILE,
                radius,
                cluster_distance,
                cluster_algorithm,
                cluster_iterations=kmedoids_updates
            ), n=4)

        ctr_structs = tasks.write_struct_ctrs(
            trajectories, topology, center_indices,
            CLUSTER_STEM + "-structure-centers.h5")

        result = ClusterFiles(
            assignments=assignments,
            distances=distances,
            center_indices=center_indices,
            feature_centers=center_features,
            structure_centers=ctr_structs)
        cluster_results.append(result)

        PLOT_PATH = os.path.join(
            'figures', 'implied',
            os.path.basename(CLUSTER_STEM) + 'implied-timescales.png')
        tasks.implied_timescales(
            assignments=assignments,
            plot_path=PLOT_PATH)

    return cluster_results
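jug.iteratetask(), used above with n=4, splits a task that returns a tuple into separate Task-like handles, one per element, without forcing a barrier the way bvalue() does. A toy sketch:

import jug

def div_and_mod(a, b):
    return a // b, a % b

t = jug.Task(div_and_mod, 7, 2)
quotient, remainder = jug.iteratetask(t, n=2)  # two views of t's two outputs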