def parse_headers(self, fields, sep='|', strip='_'):
    '''
    Split each sequence description into fields and store them as attributes.

    Every description in ``self.seqs`` has any ">" removed and is split on
    ``sep``; each piece is passed through ``fix_names``. Pieces whose position
    is a key of ``fields`` are stored in ``seq.attributes`` under the name
    ``fields[position]``. Empty pieces and "-" are stored as "".

    Afterwards ``self.seqs`` is re-keyed by the "strain" attribute and each
    sequence's ``id`` and ``name`` are set to that strain.

    Parameters:
        fields: dict mapping the position of a piece in the split description
            to the attribute name it should be stored under. One of the
            values must be "strain".
        sep: separator used to split the description.
        strip: not used by this method; kept so existing callers keep working.

    Raises:
        KeyError: if a sequence ends up without a "strain" attribute (for
            example, its description has too few pieces).
    '''
    # Explicit membership test instead of `assert`: assertions are removed
    # when Python runs with -O, which would silently disable this check.
    if "strain" not in fields.values():
        self.log.fatal("Config file: fasta_fields must contain 'strain'")

    for seq in self.seqs.values():
        if not hasattr(seq, "attributes"):
            seq.attributes = {}
        words = [fix_names(word) for word in seq.description.replace(">", "").split(sep)]
        for ii, val in enumerate(words):
            if ii not in fields:
                continue
            # "" and "-" both denote a missing value and are normalised to "".
            seq.attributes[fields[ii]] = "" if val in ("", "-") else val

    # Re-key by strain. Sequences that share a strain name collapse into a
    # single entry here, because dict keys are unique.
    self.seqs = {seq.attributes['strain']: seq for seq in self.seqs.values()}
    for strain, seq in self.seqs.items():
        seq.id = strain
        seq.name = strain
def load_reference(self, path, fmts, metadata, include=2, genes=False):
    """Load a GenBank reference and optionally add it to the sequence pool.

    A single record is read from ``path`` with ``SeqIO.read(path, 'genbank')``
    and stored on ``self.reference``. The reference then receives ``genes``,
    ``attributes``, ``name``, ``id`` and ``include`` attributes, and
    ``self.seqs`` is updated according to ``include``.

    Parameters:
        path: location of the GenBank file.
        fmts: forwarded unchanged to ``self._parse_date_per_seq`` when the
            reference is added to ``self.seqs``.
        metadata: dict of attribute values for the reference. It must contain
            "strain", and also "date" when ``include > 0``. Only keys that
            are also attribute names of the sequences already loaded are
            copied (after ``fix_names``) onto the reference.
        include: flag {0,1,2}, stored on ``self.reference.include``.
            With 0, a sequence in ``self.seqs`` with the reference's name is
            removed. With a value > 0, the reference is added to
            ``self.seqs`` unless a sequence of that name is already present.
        genes: falsy to skip gene annotation. Otherwise a container of gene
            names (it is used with ``in`` and passed to
            ``sequence_set.get_gene_name``); matching features are recorded
            in ``self.reference.genes`` as plain start/end/strand dicts.

    Problems reading the file or missing metadata keys are reported through
    ``self.log.fatal``.
    """
    try:
        self.reference = SeqIO.read(path, 'genbank')
    except Exception as e:
        self.log.fatal("Problem reading reference {}. Error: {}".format(path, e))

    ## some checks
    # Explicit checks instead of `assert` (removed under `python -O`), and the
    # message now names what is missing: the previous format string had no
    # placeholder, so the error detail was never shown.
    required_keys = ["strain"]
    if include > 0:
        required_keys.append("date")
    missing_keys = [key for key in required_keys if key not in metadata]
    if missing_keys:
        self.log.fatal("Poorly defined reference. Error: {}".format(
            "missing metadata field(s): " + ", ".join(missing_keys)))

    if genes:
        # we used to make these FeatureLocation objects here, but that won't go to JSON
        # so just do it in the Process part instead. For reference:
        # FeatureLocation(start=f.location.start, end=f.location.end, strand=1)
        self.reference.genes = {
            sequence_set.get_gene_name(f.qualifiers['gene'][0], genes): {
                "start": int(f.location.start),
                "end": int(f.location.end),
                "strand": f.location.strand,
            }
            for f in self.reference.features
            if 'gene' in f.qualifiers and f.qualifiers['gene'][0] in genes
        }
    else:
        self.reference.genes = {}

    # use the supplied metadata dict to define attributes
    # The first loaded sequence defines which attribute names are kept.
    # `next(iter(...))` works on both Python 2 and 3, unlike `.values()[0]`.
    first_seq = next(iter(self.seqs.values()), None)
    if first_seq is None:
        # No sequences loaded, so there is nothing to align with: keep every
        # supplied metadata key.
        seq_attr_keys = list(metadata.keys())
    else:
        seq_attr_keys = list(first_seq.attributes.keys())
    self.reference.attributes = {
        k: fix_names(v) for k, v in metadata.items() if k in seq_attr_keys
    }
    self.reference.name = self.reference.attributes["strain"]
    self.reference.id = self.reference.attributes["strain"]

    # is there any possibility that the reference will be added to the sequences?
    self.reference.include = include  # flag {0,1,2}
    if self.reference.name in self.seqs:
        self.log.notify("Segment {} reference already in dataset".format(self.segmentName))
        if include == 0:
            self.log.notify("Removing reference from pool of sequences to analyse")
            del self.seqs[self.reference.name]
    elif include > 0:
        ## add to sequences (tidy up attributes first)
        self._parse_date_per_seq(self.reference, fmts)
        self.seqs[self.reference.name] = self.reference
        # "date" and "num_date" are left out of the missing-attribute report.
        missing_attrs = set(seq_attr_keys) - set(self.reference.attributes.keys()) - {"date", "num_date"}
        if missing_attrs:
            # Sorted so the message is stable from run to run.
            self.log.notify("Including reference in segment {} but the following attributes are missing: {}".format(self.segmentName, " & ".join(sorted(missing_attrs))))