def __init__(self, config, workflow_name="", validate_barcodes=True):
    """Initialise the workflow state from a configuration mapping.

    Args:
        config: Mapping holding the workflow settings. The keys read here
            are ``BARCODE_IDS`` (a list, or a string containing a YAML
            list), ``BARCODES`` (path to a file in which every line
            starting with ``>`` names a barcode) and ``LOCI`` (iterable of
            locus definition files).
        workflow_name: Name stored on the instance.
        validate_barcodes: When true, check the requested barcode ids
            against the barcode file and require at least one barcode.

    Raises:
        WorkflowError: If the barcode file cannot be read, if validation
            is on and no barcode is available, or if a locus definition
            file is missing or invalid.
        AssertionError: If validation is on and a requested barcode id is
            not present in the barcode file.
    """
    self._config = config
    self._workflow_name = workflow_name
    self._locus = None

    bc_ids = config.get("BARCODE_IDS", "[]")
    if isinstance(bc_ids, str):
        # safe_load instead of yaml.load: calling yaml.load() without a
        # Loader is rejected by PyYAML >= 6 and can build arbitrary
        # objects on older releases.
        bc_ids = yaml.safe_load(bc_ids)
        if bc_ids is None:
            # An empty YAML document parses to None; treat it as "no ids".
            bc_ids = []
    self._barcode_ids = bc_ids

    try:
        barcode_path = config["BARCODES"]
    except KeyError:
        # A missing BARCODES entry is tolerated: continue without a
        # barcode file and rely on BARCODE_IDS alone.
        self._all_barcodes = []
    else:
        try:
            with open(barcode_path, "r") as bc_file:
                # Header lines look like ">name"; keep the name only.
                self._all_barcodes = [
                    line.strip()[1:]
                    for line in bc_file
                    if line.startswith(">")
                ]
        except IOError as err:
            raise WorkflowError("Could not load barcodes") from err

    if validate_barcodes:
        if self._barcode_ids and self._all_barcodes:
            if any(x not in self._all_barcodes for x in self._barcode_ids):
                # Raised explicitly (not via `assert`) so the check is not
                # stripped under `python -O`; exception type and message
                # are kept for existing callers.
                raise AssertionError("barcode id not in barcode file")
        if not self._barcode_ids:
            if self._all_barcodes:
                # No explicit selection: use every barcode in the file.
                self._barcode_ids = self._all_barcodes
            else:
                raise WorkflowError("No valid barcodes provided")

    # Map each locus name to the file that defines it.
    self._genes = {}
    for locus_file in config.get("LOCI", []):
        try:
            locus = locus_processing.load_locus_yaml(locus_file)
        except IOError as err:
            raise WorkflowError(
                "Locus definition file {} does not exist".format(locus_file)
            ) from err
        except ValueError as err:
            raise WorkflowError(
                "{} is not a valid locus definition".format(locus_file)
            ) from err
        self._genes[locus.name] = locus_file
# NOTE(review): the two statements below are the tail of a definition that
# starts before this chunk; they are kept unchanged.
    phase = int(fields[2].split("haplotype")[-1])
    return counts[(cluster, phase)]


def count_passes(allele_id, phasing):
    """Return the summed ``np`` value for the cluster/phase named in *allele_id*.

    Args:
        allele_id: Dot-separated identifier. The text after the last
            ``"cluster"`` in the second field and after the last
            ``"haplotype"`` in the third field are each parsed as an int.
        phasing: Table that is grouped by its ``cluster`` and ``phase``
            columns (presumably a pandas DataFrame -- confirm against
            the caller), or ``None``.

    Returns:
        ``0`` when *phasing* is ``None``; otherwise the sum of the ``np``
        column for the ``(cluster, phase)`` group taken from *allele_id*.
    """
    if phasing is None:
        return 0
    # Sum "np" per (cluster, phase) group; the result is indexed by that pair.
    counts = phasing.groupby(["cluster", "phase"])["np"].sum()
    fields = allele_id.split(".")
    cluster = int(fields[1].split("cluster")[-1])
    phase = int(fields[2].split("haplotype")[-1])
    return counts[(cluster, phase)]


# Locus definition loaded from the file given as the snakemake "gene" input.
gene = locus_processing.load_locus_yaml(snakemake.input.gene)


# NOTE(review): this definition continues past the end of the visible chunk.
def summarize_alleles(barcode):
    # For each input group, use the first entry that contains `barcode`.
    alleles = load_alleles(
        next(f for f in snakemake.input.haplotypes if barcode in f))
    vep = load_vep(next(f for f in snakemake.input.vep if barcode in f))
    last = load_last(next(f for f in snakemake.input.last if barcode in f))
    phasing = load_phasing_summary(
        next(f for f in snakemake.input.phasing if barcode in f))
    num_alleles = len(alleles)
    first = True
    allele_info = []
    for allele in alleles:
"""
Use PyBedTools to generate a fasta file containing the sequence for a single region
"""
from Bio import SeqIO
import pybedtools
import locus_processing
import yaml

locus = locus_processing.load_locus_yaml(snakemake.input.locus)

# Prefer the primer positions of the first target in the experiment file.
# Fall back to the coordinates of the locus itself when the experiment file
# is not configured, cannot be read, or does not contain that information.
try:
    with open(snakemake.config["EXPERIMENT"], "r") as infile:
        experiment = yaml.safe_load(infile)
    primers = experiment["targets"][0]["primers"][0]
    start_pos = primers["forward"]["start"]
    end_pos = primers["reverse"]["end"]
except (KeyError, IndexError, TypeError, IOError):
    # IndexError: empty "targets"/"primers" list. TypeError: empty or
    # non-mapping document (e.g. safe_load returned None). Both are handled
    # like a missing key instead of aborting the script.
    start_pos = locus.coordinates.start
    end_pos = locus.coordinates.end

# create a bed tool for the required region
# (BED intervals are 0-based and half-open; the -1 presumably converts a
# 1-based start position -- confirm against the locus definition.)
bed_tool = pybedtools.BedTool([(locus.chromosome.name, start_pos - 1, end_pos)])

# associate the bedtool with the reference genome fasta
bed_tool = bed_tool.sequence(fi=snakemake.input.genome)

# get the sequence and save it
with open(snakemake.output[0], "w") as outfile:
    sequence = SeqIO.read(bed_tool.seqfn, "fasta")
    SeqIO.write(sequence, outfile, "fasta")