def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column, nlargest, output_file):
    print('Reading scores')
    kmer_score_lookup = read_score_lookup_array(kmer_score_h5, motif_id,
                                                score_column, nlargest)
    shp = dinopy.shape.Shape(K_MER_LENGTH)
    far = dinopy.FastaReader(genomic_fasta)
    # First pass: collect chromosome names and lengths for the bigWig header.
    header = []
    for __, chromosome, length, __ in far.entries():
        header.append((chromosome.decode(), length))
    with pyBigWig.open(output_file, 'w') as bw:
        bw.addHeader(header)
        # Second pass: score each chromosome and write one value per base.
        for sequence, chromosome, length, interval in far.entries():
            chromosome = chromosome.decode()
            values = consume_sequence(kmer_score_lookup, sequence)
            # Cast to int to get more repeating values
            # values = np.asarray(values, dtype=int)
            bw.addEntries(chromosome, 0, values=values, span=1, step=1)
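For context, a minimal, self-contained sketch of the pyBigWig calls used above; the file name, chromosome, and values are placeholders for illustration:

import pyBigWig

# Write a tiny bigWig: one header entry, then one value per base.
bw = pyBigWig.open("example.bw", "w")
bw.addHeader([("chr1", 100)])                      # list of (name, length) pairs
bw.addEntries("chr1", 0, values=[0.5] * 100, span=1, step=1)
bw.close()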
def make_bigwig(genomic_fasta, kmer_score_h5, motif_id, score_column, nlargest, output_file):
    kmer_dict = get_kmer_dict(kmer_score_h5, motif_id, score_column, nlargest)
    far = dinopy.FastaReader(genomic_fasta)
    with open(output_file, 'w') as bed_file:
        for sequence, chromosome, length, interval in far.entries():
            chromosome = chromosome.decode()
            sequence = sequence.decode()
            for start, kmer in enumerate(iter_kmers(sequence, k=K_MER_LENGTH)):
                # Look up the k-mer; fall back to its reverse complement,
                # and skip positions where neither strand has a score.
                try:
                    name = kmer_dict[kmer]
                except KeyError:
                    try:
                        kmer = dinopy.reverse_complement(kmer)
                        name = kmer_dict[kmer]
                    except KeyError:
                        continue
                end = start + K_MER_LENGTH
                bed_file.write(f'{chromosome}\t{start}\t{end}\t{name}\n')
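The iter_kmers helper is not shown in this example; a plausible implementation, assuming it yields every overlapping length-k substring:

def iter_kmers(sequence, k):
    # Hypothetical helper: yield each overlapping k-mer of `sequence`,
    # left to right, so enumerate() gives the 0-based start position.
    for i in range(len(sequence) - k + 1):
        yield sequence[i:i + k]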
def get_stacks_data(args):
    """Read in stacks VCF file."""
    loc_seqs = dict()
    haplotypes_file = pysam.VariantFile(args.stacks_haplo, 'r')
    indexed_far = dp.FastaReader(args.stacks_fa)
    record = None
    last_locus = None
    chromosome = None
    # Merge consecutive lines describing SNPs on the same locus.
    for variant_record in haplotypes_file:
        chromosome = variant_record.chrom
        if record is None:
            seq = list(indexed_far[chromosome])[0].sequence
            record = VCFRecord(chromosome, seq, [variant_record], False)
            last_locus = variant_record.chrom
        elif variant_record.chrom == last_locus:
            record.data.append(variant_record)
        else:
            loc_seqs[last_locus] = record
            seq = list(indexed_far[chromosome])[0].sequence
            record = VCFRecord(chromosome, seq, [variant_record], False)
            last_locus = variant_record.chrom

    # Write the last record.
    if chromosome is not None:
        loc_seqs[chromosome] = record

    # Add all remaining loci without variants to the dictionary
    # so that they can be compared with the ground truth.
    far = dp.FastaReader(args.stacks_fa)
    for seq, name, *_ in far.chromosomes():
        # Split off the second part of the stacks locus name;
        # in the VCF files this information is not included.
        chromosome = name.decode().split()[0]
        # Add a record without variants for loci where stacks
        # detected no variants.
        if chromosome not in loc_seqs:
            loc_seqs[chromosome] = VCFRecord(chromosome, seq, [], False)
    return list(loc_seqs.values())
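VCFRecord is defined elsewhere in this project; a namedtuple along these lines would be consistent with how it is used above (the field names are guesses):

from collections import namedtuple

# Hypothetical definition: a locus name, its reference sequence, the list
# of variant records on it (`data`), and a flag whose meaning is not shown.
VCFRecord = namedtuple("VCFRecord", ["chrom", "seq", "data", "flag"])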
def fasta2dazzdb(args: argparse.Namespace):
    """Fix the FASTA/FASTQ headers/IDs to a DAZZ_DB compatible format so
    that these reads can be imported."""
    file_format = args.format
    if not file_format:
        if args.input != sys.stdin:
            filename = args.input.name
            file_ext = filename[filename.rfind('.')+1:]
            file_format = 'fastq' if file_ext in ('fq', 'fastq') else 'fasta'

    if not file_format:
        logger.error("Could not determine file format. Please specify using "
                     "the -f option.")
        return

    if file_format == 'fastq':
        seq_iter = iter(dinopy.FastqReader(args.input).reads(
            quality_values=False))
    else:
        seq_iter = iter(dinopy.FastaReader(args.input).reads(read_names=True))

    if args.input == sys.stdin:
        name = args.name if args.name else random_string(10)
    else:
        name = os.path.basename(args.input.name)

    moviename = daligner.generate_moviename_hash(name)
    name_mapping = {}
    seq_iter = iter(daligner.fix_header(seq_iter, moviename, name_mapping))

    logger.info("Converting FASTA/FASTQ entries...")
    with dinopy.FastaWriter(args.output, force_overwrite=True) as fw:
        fw.write_entries(seq_iter)

    if args.translations:
        logger.info("Writing name mappings to file...")
        json.dump(name_mapping, args.translations)

    logger.info("Done.")
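A sketch of the argparse wiring this function expects; every option string below is an assumption inferred from how `args` is used (only `-f` is confirmed, by the error message in the function):

import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('-f', '--format', choices=('fasta', 'fastq'))
parser.add_argument('-n', '--name',
                    help="database name to use when reading from stdin")
parser.add_argument('-t', '--translations', type=argparse.FileType('w'),
                    help="JSON file to store the header name mapping")
parser.add_argument('input', nargs='?', type=argparse.FileType('r'),
                    default=sys.stdin)
parser.add_argument('output', nargs='?', type=argparse.FileType('w'),
                    default=sys.stdout)

fasta2dazzdb(parser.parse_args())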
def overlap(args):
    args.output.write(gfa.gfa_header())
    overlapper = ExactOverlapper()
    fr = dinopy.FastaReader(args.fasta_input)
    logger.info("Building suffix tree and searching for pairwise overlaps...")
    for entry in fr.entries():
        name = entry.name.decode('utf-8')
        seq = entry.sequence.decode('utf-8')
        args.output.write(gfa.gfa_line("S", name, entry.length, "*"))
        overlapper.add_sequence(name + "+", seq)
        overlapper.add_sequence(name + "-", dinopy.reverse_complement(seq))
    overlaps = overlapper.overlaps(args.min_length)
    logger.info("Writing to GFA2...")
    for aread, bread, astart, aend, bstart, bend in overlaps:
        args.output.write(gfa.gfa_line(
            "E", "*", aread, bread, astart, aend, bstart, bend, "*"))
    logger.info("Done.")
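The gfa module is project-specific; helpers along these lines would produce the records used above, since GFA2 is tab-separated with an `H` header line carrying the version tag:

def gfa_header(version="2.0"):
    # GFA2 header line, e.g. "H\tVN:Z:2.0".
    return f"H\tVN:Z:{version}\n"

def gfa_line(*fields):
    # Hypothetical helper: a GFA record is its fields joined by tabs.
    return "\t".join(str(f) for f in fields) + "\n"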
if "errorprob" in res.content.decode(): res_all.append(res.content.decode()) c += 1 if c%1000 == 0: print(c) except: print("Max retries, going to sleep for 100 sec.") print("j= " + str(j)) sleep(100) run_requests(j, res_all, entry_list, c, payload, header, url) if __name__ == '__main__': url = 'https://mesa.mosla.de/api/all' #'http://137.248.121.201:5000/api/all' header = {'content-type': 'application/json;charset=UTF-8'} with open("mesa.json") as json_file: config = json.load(json_file) f = dp.FastaReader("mcgr_test.fasta") payload = config payload['asHTML'] = False payload["key"] = '' c = 0 res_all = list() i = 0 entry_list = list(f.entries()) run_requests(i, res_all, entry_list, c, payload, header, url) with open("results.txt", "w") as f_: for ent in res_all: f_.write(ent) f_.write("\n")
import dinopy
import pandas as pd
import tables
import numpy as np

uniq_seqs = []
for i in range(len(snakemake.input)):
    seqs = dinopy.FastaReader(snakemake.input[i])
    uniq_seqs = uniq_seqs + list(
        set(entry.sequence.decode() for entry in seqs.entries()))
# Sort for a deterministic index; pandas rejects an unordered set here.
uniq_seqs = sorted(set(uniq_seqs))

# create empty matrix and fill; all other solutions cost too much memory
sample_names = [i.split("/")[-1].split(".")[0] for i in snakemake.input]
df = pd.DataFrame(0, index=uniq_seqs, columns=sample_names, dtype=np.uint16)

# fill matrix with the per-sample abundance parsed from the "size=" header tag
for i in range(len(snakemake.input)):
    sample_name = sample_names[i]
    seqs = dinopy.FastaReader(snakemake.input[i])
    for entry in seqs.entries():
        seq = entry.sequence.decode()
        value = np.uint16(entry.name.decode().split("size=")[1].split(";")[0])
        df.at[seq, sample_name] = value

# save to file
df.index.name = "sequences"
df.to_hdf(snakemake.output[1], key='df', mode='w')
df.to_csv(snakemake.output[0])
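To consume the matrix later, it can be read back from either output; a sketch, with a placeholder path:

import pandas as pd

df = pd.read_hdf("counts.h5", key="df")      # or pd.read_csv(...)
print(df.sum(axis=1).nlargest(10))           # ten most abundant sequences overall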
import pandas as pd

import dinopy

far = dinopy.FastaReader(snakemake.input[0])
# Split each FASTA header on the first space: "<id> <taxonomy>".
header = [i.name.decode().split(" ", 1) for i in far.entries()]
df = pd.DataFrame(header, columns=["id", "taxonomy"])
df = df.set_index(keys="id", drop=True)
df.to_hdf(snakemake.output[0], key='df', mode='w')
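If the taxonomy strings follow the common semicolon-separated convention (an assumption; it depends on the reference FASTA), the ranks can be expanded into columns:

ranks = df["taxonomy"].str.split(";", expand=True)
ranks.columns = [f"rank_{i}" for i in range(ranks.shape[1])]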
def sequence_dict(fasta_file):
    return {entry.name: entry.sequence
            for entry in dinopy.FastaReader(str(fasta_file)).entries()}
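Usage sketch ("reads.fa" is a placeholder); dinopy returns names and sequences as bytes, so decode them when strings are needed:

seqs = sequence_dict("reads.fa")
for name, seq in seqs.items():
    print(name.decode(), len(seq))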
def __init__(self, file_source):
    super().__init__()
    self.reader = dinopy.FastaReader(file_source)
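A sketch of how such a wrapper class might look in full; the class name and the iteration method are assumptions, not from the source:

import dinopy

class FastaSource(object):
    def __init__(self, file_source):
        super().__init__()
        self.reader = dinopy.FastaReader(file_source)

    def __iter__(self):
        # Yield (name, sequence) pairs; both come back from dinopy as bytes.
        for entry in self.reader.entries():
            yield entry.name, entry.sequence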