def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """Cluster the named sequences with uclust and return the hit table.

    The sequences listed in ``seqnames`` are fetched from ``seqfile`` into a
    temporary FASTA file with ``wrap.esl_sfetch``; ``uclust.cluster`` is then
    run on that file and its ``.uc`` output is parsed with
    ``uclust.parse_uclust_as_df``.

    Args:
        seqfile: sequence source handed to ``wrap.esl_sfetch``.
        seqnames: names of the sequences to fetch and cluster.
        identity: passed to ``uclust.cluster`` as ``pct_id``.
        prefix: prefix for the names of the two temporary files.  Any ``/``
            is replaced with a backslash before use.
        threads: passed through to ``uclust.cluster``.

    Returns:
        The parsed table restricted to the ``type``, ``query_label`` and
        ``target_label`` columns, without the rows whose ``type`` is ``'C'``.
    """
    # A '/' inside a temp-file prefix is read as a directory separator and
    # confuses the filesystem, so it is swapped for a harmless character.
    prefix = prefix.replace('/', '\\')

    with util.ntf(prefix=prefix, suffix='.fasta') as fa, \
            util.ntf(prefix=prefix, suffix='.uc') as uc:
        wrap.esl_sfetch(seqfile, seqnames, fa)
        # uclust is given the file by name, so buffered output has to be
        # pushed out first.
        fa.flush()

        uclust.cluster(fa.name, uc.name, pct_id=identity,
                       pre_sorted=False, quiet=True, threads=threads)

        df = uclust.parse_uclust_as_df(uc)
        # Drop the 'C' rows (presumably per-cluster summary records in the
        # .uc format -- confirm against the uclust module).
        df = df[df.type != 'C']
        df = df[['type', 'query_label', 'target_label']]
        return df
def cluster(seqfile, seqnames, identity=1.0, prefix='cluster-', threads=None):
    """Run uclust over a subset of sequences and return its hit records.

    ``seqnames`` are extracted from ``seqfile`` into a temporary FASTA file
    (``wrap.esl_sfetch``), clustered with ``uclust.cluster`` using
    ``identity`` as ``pct_id``, and the resulting ``.uc`` file is parsed
    with ``uclust.parse_uclust_as_df``.

    ``prefix`` names the temporary files; ``/`` characters in it are
    replaced with a backslash.  ``threads`` is forwarded to
    ``uclust.cluster``.

    Returns the parsed table with only the ``type``, ``query_label`` and
    ``target_label`` columns, excluding rows whose ``type`` is ``'C'``.
    """
    # / confuses the filesystem
    safe_prefix = prefix.replace('/', '\\')
    wanted_columns = ['type', 'query_label', 'target_label']

    with util.ntf(prefix=safe_prefix, suffix='.fasta') as fasta_tf:
        with util.ntf(prefix=safe_prefix, suffix='.uc') as uc_tf:
            wrap.esl_sfetch(seqfile, seqnames, fasta_tf)
            # uclust opens the FASTA by name; make sure it is written out.
            fasta_tf.flush()

            uclust.cluster(
                fasta_tf.name,
                uc_tf.name,
                pct_id=identity,
                pre_sorted=False,
                quiet=True,
                threads=threads,
            )

            records = uclust.parse_uclust_as_df(uc_tf)
            # Discard rows of type 'C', then keep only the label columns.
            hits = records[records.type != 'C'][wanted_columns]

    return hits
def add_clusters_to_refpkg(refpkg, **kwargs):
    """Annotate a reference package's taxonomy and seq_info with clusters.

    Reads the ``taxonomy`` and ``seq_info`` resources from ``refpkg``, hands
    them to ``add_cluster_taxids`` (together with ``**kwargs``), writes the
    results to temporary CSV files and replaces both resources in the
    reference package inside a single transaction.

    Args:
        refpkg: reference package object providing ``open_resource``,
            ``update_file`` and the transaction methods used below.
        **kwargs: forwarded unchanged to ``add_cluster_taxids``.
    """
    with refpkg.open_resource('taxonomy') as taxonomy_handle:
        taxonomy = taxtable.read(taxonomy_handle)

    with refpkg.open_resource('seq_info') as seq_info_handle:
        reader = csv.DictReader(seq_info_handle)
        seq_info_rows = list(reader)

    # Annotate
    add_cluster_taxids(taxonomy, seq_info_rows, **kwargs)

    with util.ntf(prefix='seq_info-', suffix='.csv') as seq_info_out:
        with util.ntf(prefix='taxonomy-', suffix='.csv') as taxonomy_out:
            # Reuse the column order of the seq_info file that was read.
            writer = csv.DictWriter(seq_info_out,
                                    fieldnames=reader.fieldnames)
            writer.writeheader()
            writer.writerows(seq_info_rows)
            # The refpkg is given the file by name below; closing first
            # presumably ensures everything has been written out.
            seq_info_out.close()

            taxonomy.write_taxtable(taxonomy_out)
            taxonomy_out.close()

            # Swap in both files as one transaction.
            refpkg.start_transaction()
            refpkg.update_file('seq_info', seq_info_out.name)
            refpkg.update_file('taxonomy', taxonomy_out.name)
            refpkg.commit_transaction()