Ejemplo n.º 1
0
 def parse_headers(self, fields, sep='|', strip='_'):
     '''
     Split each sequence description into fields and store them as
     attributes, then re-key ``self.seqs`` by strain name.

     Parameters
     ----------
     fields : dict
         Maps a zero-based position in the ``sep``-split description to
         the attribute name stored under ``seq.attributes``. One of the
         values must be ``'strain'``.
     sep : str
         Separator used to split ``seq.description``.
     strip : str
         Not used by this method; kept so existing callers keep working.

     Side effects
     ------------
     * ``seq.attributes`` is created if absent and filled for every
       position listed in ``fields``; values that are ``""`` or ``"-"``
       (after ``fix_names``) are stored as ``""``.
     * ``self.seqs`` is rebuilt keyed by ``seq.attributes['strain']``;
       sequences that share a strain name overwrite each other.
     * ``seq.id`` and ``seq.name`` are set to the strain name.

     Raises
     ------
     KeyError
         If a sequence ends up without a ``'strain'`` attribute (e.g. its
         description has too few fields).
     '''
     # Explicit check instead of ``assert``: assertions are stripped when
     # Python runs with -O, which would silently skip this validation.
     # NOTE(review): self.log.fatal is presumably terminating -- confirm.
     if "strain" not in fields.values():
         self.log.fatal("Config file: fasta_fields must contain 'strain'")

     for seq in self.seqs.values():
         if not hasattr(seq, "attributes"):
             seq.attributes = {}
         raw_words = seq.description.replace(">", "").split(sep)
         # Every word is passed through fix_names, as before, even when its
         # position is not listed in ``fields``.
         words = (fix_names(word) for word in raw_words)
         for ii, val in enumerate(words):
             if ii not in fields:
                 continue
             # Placeholder values are normalised to the empty string.
             seq.attributes[fields[ii]] = "" if val in ("", "-") else val

     # Re-key the collection by strain name.
     self.seqs = {
         seq.attributes['strain']: seq
         for seq in self.seqs.values()
     }
     for seq in self.seqs.values():
         strain = seq.attributes['strain']
         seq.id = strain
         seq.name = strain
Ejemplo n.º 2
0
    def load_reference(self, path, fmts, metadata, include=2, genes=False):
        """Load a GenBank reference and optionally add it to the sequences.

        The file at ``path`` is read with ``SeqIO.read(path, 'genbank')`` and
        stored on ``self.reference``; its ``genes``, ``attributes``, ``name``,
        ``id`` and ``include`` attributes are then populated.

        Parameters
        ----------
        path : str
            Location of the reference file (assumed to be GenBank format).
        fmts :
            Passed unchanged to ``self._parse_date_per_seq`` when the
            reference is added to the sequence pool.
        metadata : dict
            Reference metadata. Must contain ``"strain"``, and also ``"date"``
            when ``include > 0``. Only keys that are also attributes of the
            existing sequences are copied (through ``fix_names``).
        include : int
            Flag in {0, 1, 2}. ``0``: the reference is removed from
            ``self.seqs`` if already present. ``> 0``: the reference is added
            to ``self.seqs`` if not already present.
        genes :
            Falsy to skip gene annotation; otherwise a container of gene
            names. Features whose ``gene`` qualifier is in ``genes`` are
            recorded with start / end / strand.

        Raises
        ------
        IndexError
            If ``self.seqs`` is empty (no sequence to take attribute keys
            from).
        KeyError
            If ``"strain"`` is not among the attributes copied from
            ``metadata``.
        """
        try:
            self.reference = SeqIO.read(path, 'genbank')
        except Exception as e:
            self.log.fatal("Problem reading reference {}. Error: {}".format(path, e))

        # Validate the metadata explicitly: ``assert`` is stripped under -O,
        # and the old message had no ``{}`` placeholder so the reason was
        # never reported.
        required_keys = ["strain"]
        if include > 0:
            required_keys.append("date")
        missing_keys = [key for key in required_keys if key not in metadata]
        if missing_keys:
            self.log.fatal(
                "Poorly defined reference. Error: missing metadata key(s): {}".format(
                    ", ".join(missing_keys)))

        if genes:
            # We used to build FeatureLocation objects here, but those do not
            # serialise to JSON, so plain ints are stored and the Process part
            # rebuilds them. For reference:
            # FeatureLocation(start=f.location.start, end=f.location.end, strand=1)
            self.reference.genes = {
                sequence_set.get_gene_name(f.qualifiers['gene'][0], genes): {
                    "start": int(f.location.start),
                    "end": int(f.location.end),
                    "strand": f.location.strand,
                }
                for f in self.reference.features
                if 'gene' in f.qualifiers and f.qualifiers['gene'][0] in genes
            }
        else:
            self.reference.genes = {}

        # Use the supplied metadata dict to define attributes, restricted to
        # the attribute names the existing sequences carry.
        # dict.values() is not indexable on Python 3, so materialise it first;
        # an empty pool still raises IndexError exactly as before.
        seq_attr_keys = list(self.seqs.values())[0].attributes.keys()
        self.reference.attributes = {
            k: fix_names(v) for k, v in metadata.items() if k in seq_attr_keys
        }
        self.reference.name = self.reference.attributes["strain"]
        self.reference.id = self.reference.attributes["strain"]

        # Is there any possibility that the reference will be added to the
        # sequences?
        self.reference.include = include  # flag {0,1,2}
        if self.reference.name in self.seqs:
            self.log.notify("Segment {} reference already in dataset".format(self.segmentName))
            if include == 0:
                self.log.notify("Removing reference from pool of sequences to analyse")
                del self.seqs[self.reference.name]
        elif include > 0:
            # Add to sequences (tidy up attributes first).
            self._parse_date_per_seq(self.reference, fmts)
            self.seqs[self.reference.name] = self.reference
            # "date" / "num_date" are excluded from the missing-attribute report.
            missing_attrs = (set(seq_attr_keys)
                             - set(self.reference.attributes.keys())
                             - {"date", "num_date"})
            if missing_attrs:
                # Sorted so the log message is deterministic.
                self.log.notify("Including reference in segment {} but the following attributes are missing: {}".format(self.segmentName, " & ".join(sorted(missing_attrs))))