Esempio n. 1
0
def create_gene_feature(gene_name, feature_location, feature_qualifiers):
    """Create a minimal SeqFeature of type 'gene'.

    Args:
        gene_name: Name stored, as a one-element list, under the 'gene'
            qualifier.
        feature_location: Location object handed to SeqFeature unchanged.
        feature_qualifiers: Mapping of extra qualifiers. On a key clash
            (including 'gene') its value replaces the default one.

    Returns:
        The new SeqFeature carrying the merged qualifiers dict.
    """
    gene_feature = SeqFeature(feature_location, type='gene')
    # Merge with update(): concatenating `items()` results only works with
    # Python 2 lists and raises TypeError on Python 3 dict views.
    qualifiers = {'gene': [gene_name]}
    qualifiers.update(feature_qualifiers)
    gene_feature.qualifiers = qualifiers
    return gene_feature
Esempio n. 2
0
def write_gbk(results, query_dic, out_dir, out_prefix):
    """Write one GenBank file per query id found in ``results``.

    ``results`` is sorted in place by its ``'qid'`` entry and grouped per
    query. For every query id (in sorted order) a DNA SeqRecord is built from
    ``query_dic[qid].seq``; each hit becomes a 'CDS' feature when
    ``test_cds(hit) == 1`` and a 'misc_feature' otherwise. Features are
    ordered by start position and receive consecutive ``DET_<n>`` locus tags
    that keep counting across records. The record is written to
    ``<out_dir>/<id with '.' replaced by '-'>.gbk``.

    ``out_prefix`` is accepted but not used by the current body.
    """
    results.sort(key=operator.itemgetter('qid'))

    # Bucket the hits by the query sequence they were found on.
    hits_by_query = {}
    for hit in results:
        hits_by_query.setdefault(hit['qid'], []).append(hit)

    tag_number = 0
    for query_id in sorted(hits_by_query):
        record = SeqRecord(Seq(str(query_dic[query_id].seq)), id=query_id, name=query_id,
                           description='', annotations={"molecule_type": "DNA"})

        for hit in hits_by_query[query_id]:
            feature_type = 'CDS' if test_cds(hit) == 1 else 'misc_feature'
            span = FeatureLocation(hit['qstart'] - 1, hit['qend'], strand=hit['strand'])
            feature = SeqFeature(span, type=feature_type, qualifiers={})

            has_protein = 'qprot' in hit
            pairs = [('product', hit['tid'].split('::')[0]),
                     ('note', description(hit))]
            if has_protein:
                pairs.append(('translation', hit['qprot']))
            feature.qualifiers = OrderedDict(pairs)
            record.features.append(feature)

        record.features = sorted(record.features, key=lambda feat: feat.location.start)

        # The locus tag goes first in the qualifier order; numbering is global.
        for feature in record.features:
            tag_number += 1
            tagged = [('locus_tag', f'DET_{tag_number}')]
            tagged.extend(feature.qualifiers.items())
            feature.qualifiers = OrderedDict(tagged)

        file_stem = str(record.id).replace(".", "-")
        out_file = os.path.join(out_dir, f'{file_stem}.gbk')
        with open(out_file, 'w') as out_f:
            SeqIO.write([record], out_f, 'genbank')
Esempio n. 3
0
def _rebase_feature(feat):
    """Return a new SeqFeature mirroring *feat* with its location moved to start at 0."""
    # NOTE(review): the shift uses the feature's own start, so a non-gene
    # feature loses its offset relative to the enclosing gene - confirm
    # whether gene-relative coordinates were intended.
    new_location = FeatureLocation(
        feat.location.start - feat.location.start,
        feat.location.end - feat.location.start)
    new_feat = SeqFeature(location=new_location,
                          type=feat.type,
                          strand=feat.strand,
                          ref=feat.ref,
                          ref_db=feat.ref_db)
    new_feat.qualifiers = feat.qualifiers
    return new_feat


def divide_genome(genome):
    """Split annotated records into one SeqRecord per 'gene' feature.

    Every feature of type 'gene' starts a new record holding the extracted
    gene sequence; its id and name are the first 'locus_tag' qualifier and
    its description and a fixed set of annotations are copied from the
    parent record. Features that follow a gene, up to the next gene, are
    attached to that gene's record.

    Args:
        genome: Iterable of records exposing ``features``, ``seq``,
            ``description`` and ``annotations``.

    Returns:
        List of the per-gene SeqRecords, in input order.

    Raises:
        KeyError: If a gene lacks 'locus_tag' or the parent record lacks one
            of the copied annotations.
    """
    copied_annotations = ('topology', 'date', 'taxonomy', 'source',
                          'organism', 'sequence_version',
                          'data_file_division', 'references')
    output_genes = []
    for chrom in genome:
        # Reset per chromosome so leading features of this chromosome are not
        # attached to the last gene of the previous one.
        new_record = None
        for feat in chrom.features:
            if feat.type != 'gene':
                # Features ahead of the first gene (e.g. 'source') have no
                # gene record to belong to; previously this crashed.
                if new_record is not None:
                    new_record.features.append(_rebase_feature(feat))
                continue

            if new_record is not None:
                output_genes.append(new_record)
            locus_tag = feat.qualifiers['locus_tag'][0]
            new_record = SeqRecord(feat.extract(chrom.seq))
            new_record.id = locus_tag
            new_record.description = chrom.description
            new_record.name = locus_tag
            new_record.features = [_rebase_feature(feat)]
            for key in copied_annotations:
                new_record.annotations[key] = chrom.annotations[key]

        # Flush the final gene of this chromosome; it used to be dropped.
        if new_record is not None:
            output_genes.append(new_record)
    return output_genes
	def parse(self):
		"""Turn the FASTA file at ``self._file`` into one feature-only SeqRecord.

		Each FASTA description must match the ``ref|id|:start-end|description|
		[gene=...] [locus_tag=...]`` pattern; descriptions that do not match
		are reported in ``self.errors`` and skipped. The first matching header
		provides the record id and name. Every matching entry adds a "gene"
		SeqFeature located at start..end whose qualifiers hold the locus tag,
		gene name, description (as "note") and the entry's sequence.

		Returns the assembled SeqRecord.
		"""
		with open(self._file) as fasta_handle:
			genbank = SeqRecord(Seq.UnknownSeq(0))
			header_re = re.compile(r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*")
			record_named = False
			for entry in SeqIO.parse(fasta_handle, "fasta"):
				description = entry.description
				hit = header_re.match(description)
				if hit is None:
					self.errors.append("Invalid header: >" + description)
					continue

				fields = hit.groupdict()
				if not record_named:
					# Only the first valid header names the record.
					record_named = True
					genbank.id = fields["id"]
					genbank.name = fields["id"]

				span = FeatureLocation(int(fields["start"]), int(fields["end"]))
				gene_feature = SeqFeature(span, type = "gene")
				gene_feature.qualifiers = {
					"locus_tag": fields["locus_tag"],
					"gene": fields["gene"],
					"note": fields["description"],
					"sequence": entry.seq,
				}
				genbank.features.append(gene_feature)

		return genbank
Esempio n. 5
0
def get_genome_seqrecord_features(phage_genome):
    """Assemble the SeqFeature list for a genome's SeqRecord.

    The list starts with a "source" feature spanning 0..``length`` on the
    plus strand, followed by two features per CDS (first set up as "gene",
    then as "CDS") and two per tRNA (first as "gene", then with the default
    type of ``set_seqfeature``).

    :param phage_genome:
        Genome object providing ``length``, ``host_genus``, ``name``,
        ``cds_features`` and ``trna_features``.
    :type phage_genome: genome
    :returns:
        List of SeqFeature objects, source feature first.
    """
    whole_genome = FeatureLocation(0, phage_genome.length)
    source_feature = SeqFeature(whole_genome, strand=1, type="source")
    source_label = f"{phage_genome.host_genus} phage {phage_genome.name}"
    source_feature.qualifiers = OrderedDict()
    source_feature.qualifiers["source"] = source_label

    features = [source_feature]

    for cds in phage_genome.cds_features:
        # Each CDS is emitted twice: once typed "gene", once typed "CDS".
        for feature_type in ("gene", "CDS"):
            cds.set_seqfeature(type=feature_type)
            features.append(cds.seqfeature)

    for trna in phage_genome.trna_features:
        trna.set_seqfeature(type="gene")
        features.append(trna.seqfeature)
        # Second pass relies on set_seqfeature's own default type.
        trna.set_seqfeature()
        features.append(trna.seqfeature)

    return features
Esempio n. 6
0
    def parse(self):
        """Turn the FASTA file at ``self._file`` into one feature-only SeqRecord.

        Each FASTA description must match the ``ref|id|:start-end|description|
        [gene=...] [locus_tag=...]`` pattern; descriptions that do not match
        are reported in ``self.errors`` and skipped. The first matching header
        provides the record id and name. Every matching entry adds a "gene"
        SeqFeature located at start..end whose qualifiers hold the locus tag,
        gene name, description (as "note") and the entry's sequence.

        Returns the assembled SeqRecord.
        """
        with open(self._file) as fasta_handle:
            genbank = SeqRecord(Seq.UnknownSeq(0))
            header_re = re.compile(
                r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*"
            )
            record_named = False
            for entry in SeqIO.parse(fasta_handle, "fasta"):
                description = entry.description
                hit = header_re.match(description)
                if hit is None:
                    self.errors.append("Invalid header: >" + description)
                    continue

                fields = hit.groupdict()
                if not record_named:
                    # Only the first valid header names the record.
                    record_named = True
                    genbank.id = fields["id"]
                    genbank.name = fields["id"]

                span = FeatureLocation(int(fields["start"]), int(fields["end"]))
                gene_feature = SeqFeature(span, type="gene")
                gene_feature.qualifiers = {
                    "locus_tag": fields["locus_tag"],
                    "gene": fields["gene"],
                    "note": fields["description"],
                    "sequence": entry.seq,
                }
                genbank.features.append(gene_feature)

        return genbank
Esempio n. 7
0
def make_protein_feature(feature_name, feature_start, feature_end, feature_type):
    ''' Build a SeqFeature of the given type spanning start..end.

    Start and end are converted with int(). Only features of type "Region"
    receive a 'name' qualifier holding feature_name; other types keep the
    qualifiers SeqFeature creates by default.
    '''
    span = FeatureLocation(int(feature_start), int(feature_end))
    protein_feature = SeqFeature(span, type=feature_type)
    is_region = feature_type == "Region"
    if is_region:
        protein_feature.qualifiers = {'name': [feature_name]}
    return protein_feature
Esempio n. 8
0
def create_cluster_borders(anchor: str, clusters: List[ClusterPrediction],
                           record: Record) -> List[ClusterBorder]:
    """ Build a ClusterBorder for every predicted cluster.

        For each prediction the genes named by its start and end markers are
        looked up in the record; the border spans from the left gene's start
        to the right gene's end. The first prediction is annotated as the
        best one, later ones as numbered alternatives.
    """
    if not clusters:
        return []

    def as_score(value):
        # Scores are stored in scientific notation with one decimal.
        return "{:.1e}".format(value)

    borders = []
    for rank, prediction in enumerate(clusters):
        # hmmdetect borders derive from CDS features and are exact, whereas
        # cassis borders derive from gene features and so may be fuzzy.
        first, last = prediction.start, prediction.end
        left_name, right_name = first.gene, last.gene

        left = right = None
        for gene in record.get_genes():
            if gene.get_name() == left_name:
                left = gene
            if gene.get_name() == right_name:
                right = gene
            if left and right:
                break

        span = FeatureLocation(left.location.start, right.location.end)
        border_feature = SeqFeature(span, type="cluster_border")
        border_feature.qualifiers = {
            "aStool": ["cassis"],
            "anchor": [anchor],
            "abundance": [first.abundance + last.abundance],
            "motif_score": [as_score(first.score + last.score)],
            "gene_left": [first.gene],
            "promoter_left": [first.promoter],
            "abundance_left": [first.abundance],
            "motif_left": [first.pairing_string],
            "motif_score_left": [as_score(first.score)],
            "gene_right": [last.gene],
            "promoter_right": [last.promoter],
            "abundance_right": [last.abundance],
            "motif_right": [last.pairing_string],
            "motif_score_right": [as_score(last.score)],
            "genes": [prediction.genes],
            "promoters": [prediction.promoters],
        }

        if rank == 0:
            note = "best prediction (most abundant) for anchor gene {}".format(anchor)
        else:
            note = "alternative prediction ({}) for anchor gene {}".format(rank, anchor)
        border_feature.qualifiers["note"] = [note]

        borders.append(ClusterBorder.from_biopython(border_feature))
    return borders
Esempio n. 9
0
 def _get_feature(self, feature_dict):
     """Rebuild a Biopython SeqFeature from its dictionary form.

     Reads the 'location' (unpacked into FeatureLocation), 'type', 'id',
     'strand' and 'quals' entries of feature_dict.
     """
     span = FeatureLocation(*feature_dict['location'])
     feature_type = feature_dict['type']
     keyword_args = {'id': feature_dict['id'],
                     'strand': feature_dict['strand']}
     rebuilt = SeqFeature(span, feature_type, **keyword_args)
     rebuilt.qualifiers = feature_dict['quals']
     return rebuilt
Esempio n. 10
0
 def _get_feature(self, feature_dict):
     """Rebuild a Biopython SeqFeature from its dictionary form.

     Reads the 'location' (unpacked into FeatureLocation), 'type', 'id',
     'strand' and 'quals' entries of feature_dict.
     """
     span = FeatureLocation(*feature_dict['location'])
     feature_type = feature_dict['type']
     keyword_args = {'id': feature_dict['id'],
                     'strand': feature_dict['strand']}
     rebuilt = SeqFeature(span, feature_type, **keyword_args)
     rebuilt.qualifiers = feature_dict['quals']
     return rebuilt
Esempio n. 11
0
def feature_intervals_to_features(
    features: List[FeatureInterval],
    strand: Strand,
    force_strand: bool,
    feature_name: Optional[str] = None,
    locus_tag: Optional[str] = None,
) -> Iterable[SeqFeature]:
    """Yield one :class:`Bio.SeqFeature.SeqFeature` per feature interval.

    Each interval's location is converted with ``to_biopython()`` and its
    exported qualifiers are copied into lists. When the interval's strand
    differs from ``strand`` a ``StrandViolationWarning`` is emitted; the
    interval is then either emitted on the gene strand or skipped.

    Args:
        features: The feature intervals to convert.
        strand: ``Strand`` of the gene; its value is set on every emitted feature.
        force_strand: If ``True`` mismatching intervals are kept and forced to
            ``strand``; if ``False`` they are skipped.
        feature_name: Optional name stored under the ``gene`` qualifier.
        locus_tag: Optional tag stored under the ``locus_tag`` qualifier.

    Yields:
        A ``SeqFeature`` for each interval that is not skipped.
    """
    for interval in features:
        biopython_location = interval._location.to_biopython()

        qualifiers = {}
        for key, vals in interval.export_qualifiers().items():
            qualifiers[key] = list(vals)
        if feature_name:
            qualifiers["gene"] = [feature_name]
        if locus_tag:
            qualifiers["locus_tag"] = [locus_tag]

        if biopython_location.strand != strand.value:
            message = f"Found strand mismatch between gene and feature on feature {interval}. "
            if force_strand:
                message += "Forcing this transcript to the gene orientation."
            else:
                message += "Skipping this transcript."
            warnings.warn(message, StrandViolationWarning)
            if not force_strand:
                continue

        seq_feature = SeqFeature(
            biopython_location,
            type=FeatureIntervalFeatures.FEATURE_INTERVAL.value,
            strand=strand.value)
        seq_feature.qualifiers = dict(qualifiers)

        yield seq_feature
Esempio n. 12
0
def make_seq_feature(start, end, ftype, quals=None):
    '''
    Create a plus-strand sequence feature from a start, an end and a type.

    quals may hold additional qualifiers (note, label, evidence, citation,
    ...) as a dict. It is copied, so neither the caller's dict nor a shared
    default is modified. The 'source' qualifier is always set to
    ['splicemod'], replacing any 'source' entry given in quals.
    '''
    seq_feature = SeqFeature(FeatureLocation(start, end), strand=+1, type=ftype)
    # A fresh dict per call: the former `quals={}` default was mutated below
    # and therefore shared between every feature created without quals.
    qualifiers = dict(quals) if quals else {}
    qualifiers['source'] = ['splicemod']
    seq_feature.qualifiers = qualifiers
    return seq_feature
Esempio n. 13
0
 def _get_feature(self, feature_dict):
     """Rebuild a Biopython SeqFeature from its dictionary form.

     Reads the "location" (unpacked into FeatureLocation), "type", "id",
     "strand" and "quals" entries of feature_dict.
     """
     span = FeatureLocation(*feature_dict["location"])
     feature_type = feature_dict["type"]
     keyword_args = {"id": feature_dict["id"],
                     "strand": feature_dict["strand"]}
     rebuilt = SeqFeature(span, feature_type, **keyword_args)
     rebuilt.qualifiers = feature_dict["quals"]
     return rebuilt
Esempio n. 14
0
def _make_fake_feature(start, end, probability=None, pfam_id=None, type_=None):
    """Build a SeqFeature over start..end with optional test annotations.

    The qualifiers always contain 'note' and 'db_xref' lists. A probability
    adds a 'ClusterFinder probability' note, a pfam_id adds a 'PFAM' db_xref,
    and type_ (when given) becomes the feature type.
    """
    fake = SeqFeature(FeatureLocation(start, end))
    notes, xrefs = [], []
    fake.qualifiers = {'note': notes, 'db_xref': xrefs}
    if probability is not None:
        notes.append('ClusterFinder probability: %02.4f' % probability)
    if pfam_id is not None:
        xrefs.append('PFAM: %s' % pfam_id)
    if type_ is not None:
        fake.type = type_
    return fake
Esempio n. 15
0
def check_sub(feature, sequence) -> List[SeqFeature]:
    """ Recursively checks a GFF feature for any subfeatures and generates any
        appropriate SeqFeature instances from them.

        Sub-features that have children of their own are descended into;
        childless 'CDS' sub-features are collected through
        generate_details_from_subfeature and merged into a single CDS
        feature (with a compound location when there are several parts).

        Arguments:
            feature: the GFF feature whose ``sub_features`` are inspected
            sequence: the record whose ``seq`` is used for the translation

        Returns:
            a list of the SeqFeatures built, empty if there was nothing to build
    """
    new_features = []
    locations = []  # type: List[FeatureLocation]
    trans_locations = []  # type: List[FeatureLocation]
    qualifiers = {}  # type: Dict[str, List[str]]
    mismatching_qualifiers = set()  # type: Set[str]
    for sub in feature.sub_features:
        if sub.sub_features:  # If there are sub_features, go deeper
            new_features.extend(check_sub(sub, sequence))
        elif sub.type == 'CDS':
            sub_mismatch = generate_details_from_subfeature(
                sub, qualifiers, locations, trans_locations)
            mismatching_qualifiers.update(sub_mismatch)

    # qualifiers that differ between the parts cannot describe the merged CDS
    for qualifier in mismatching_qualifiers:
        del qualifiers[qualifier]
    if 'Parent' in qualifiers:
        del qualifiers['Parent']

    # if nothing to work on
    if not new_features and not locations:
        return []

    # Only works in tip of the tree, when there's no new_feature built yet. If there is,
    # it means the script just came out of a check_sub and it's ready to return.
    if not new_features:
        new_loc = locations[0]
        # construct a compound location if required
        if len(locations) > 1:
            locations = sorted(locations, key=lambda x: x.start.real)
            trans_locations = sorted(trans_locations,
                                     key=lambda x: x.start.real)
            if locations[0].strand == 1:
                new_loc = CompoundLocation(locations)
            else:
                # reverse strand: parts are joined in descending order
                new_loc = CompoundLocation(list(reversed(locations)))
                trans_locations = list(reversed(trans_locations))
        # TODO: use new secmet features
        new_feature = SeqFeature(new_loc)
        new_feature.qualifiers = qualifiers
        new_feature.type = 'CDS'
        # str() rather than the private Seq._data attribute: _data is not
        # guaranteed to be a str (it holds bytes in recent Biopython
        # releases), which ''.join() cannot concatenate.
        trans = ''.join(
            str(part.extract(sequence.seq).translate(stop_symbol=''))
            for part in trans_locations
        )
        new_feature.qualifiers['translation'] = [trans]
        new_features.append(new_feature)

    return new_features
Esempio n. 16
0
 def _add_gff_line(self, rec, gff_parts, parents, children):
     """Add details from a GFF line to the given SeqRecord.

     gff_parts holds the nine GFF columns; '.' entries count as missing.
     Source, score, phase and the ``key=value`` attributes of column nine
     (values split on ',') become list-valued qualifiers.

     A line with both start and end describes a feature: without an ID it is
     appended to rec.features, with a Parent attribute it is filed under
     each parent in ``children``, otherwise under rec.id in ``parents``.
     A line without coordinates adds its qualifiers to rec.annotations,
     extending values that are already present.

     Returns the (rec, parents, children) triple.
     """
     gff_parts = [(None if p == '.' else p) for p in gff_parts]
     assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id,
                                                            gff_parts[0])
     # collect all of the base qualifiers for this item
     quals = collections.defaultdict(list)
     if gff_parts[1]:
         quals["source"].append(gff_parts[1])
     if gff_parts[5]:
         quals["score"].append(gff_parts[5])
     if gff_parts[7]:
         quals["phase"].append(gff_parts[7])
     # A missing attribute column ('.') means there are no attributes.
     for attribute in (gff_parts[8] or '').split(';'):
         if not attribute:
             # tolerate a trailing or doubled ';'
             continue
         # split once only, so values may themselves contain '='
         key, val = attribute.split('=', 1)
         quals[key].extend(val.split(','))
     quals = dict(quals)
     # if we are describing a location, then we are a feature
     if gff_parts[3] and gff_parts[4]:
         # GFF starts are 1-based; shift to a 0-based start
         location = FeatureLocation(
             int(gff_parts[3]) - 1, int(gff_parts[4]))
         new_feature = SeqFeature(location,
                                  gff_parts[2],
                                  id=quals.get('ID', [''])[0],
                                  strand=self._strand_map[gff_parts[6]])
         new_feature.qualifiers = quals
         # Handle flat features
         if not new_feature.id:
             rec.features.append(new_feature)
         # features that have parents need to link so we can pick up
         # the relationship
         elif 'Parent' in new_feature.qualifiers:
             for parent in new_feature.qualifiers['Parent']:
                 children[parent].append(new_feature)
         # top level features
         else:
             parents[rec.id].append(new_feature)
     # otherwise, associate these annotations with the full record
     else:
         # add these as a list of annotations, checking not to overwrite
         # current values
         for key, vals in quals.items():
             if key in rec.annotations:
                 try:
                     rec.annotations[key].extend(vals)
                 except AttributeError:
                     # existing value is a scalar: promote it to a list
                     rec.annotations[key] = [rec.annotations[key]] + vals
             else:
                 rec.annotations[key] = vals
     return rec, parents, children
Esempio n. 17
0
def attach_features(predictions, seqrecord):
    """Append a CDS feature to seqrecord for every confident prediction.

    Predictions are looked up under ``seqrecord.id``; only those whose
    ``raw_score`` is at least 1.0 are used. Each resulting feature takes the
    prediction's location and strand and carries its ``cds_id`` as the
    'locus_tag' qualifier.
    """
    confident = (hit for hit in predictions[seqrecord.id]
                 if hit.raw_score >= 1.0)
    for hit in confident:
        locus_qualifiers = {'locus_tag': [hit.cds_id]}
        cds = SeqFeature(
            location=hit.location,
            type='CDS',
            strand=hit.strand,
            qualifiers=locus_qualifiers,
        )
        cds.qualifiers = locus_qualifiers
        seqrecord.features.append(cds)
Esempio n. 18
0
def attach_features(predictions, seqrecord):
    """Append a CDS feature to seqrecord for every confident prediction.

    Predictions are looked up under ``seqrecord.id``; those whose
    ``raw_score`` is not at least 1.0 are ignored. Each resulting feature
    takes the prediction's location and strand and carries its ``cds_id`` as
    the 'locus_tag' qualifier.
    """
    for candidate in predictions[seqrecord.id]:
        if not candidate.raw_score >= 1.0:
            continue
        locus_qualifiers = {'locus_tag': [candidate.cds_id]}
        cds = SeqFeature(
            location=candidate.location,
            type='CDS',
            strand=candidate.strand,
            qualifiers=locus_qualifiers,
        )
        cds.qualifiers = locus_qualifiers
        seqrecord.features.append(cds)
Esempio n. 19
0
 def _add_gff_line(self, rec, gff_parts, parents, children):
     """Add details from a GFF line to the given SeqRecord.

     gff_parts holds the nine GFF columns; '.' entries count as missing.
     Source, score, phase and the ``key=value`` attributes of column nine
     (values split on ',') become list-valued qualifiers.

     A line with both start and end describes a feature: without an ID it is
     appended to rec.features, with a Parent attribute it is filed under
     each parent in ``children``, otherwise under rec.id in ``parents``.
     A line without coordinates adds its qualifiers to rec.annotations,
     extending values that are already present.

     Returns the (rec, parents, children) triple.
     """
     gff_parts = [(None if p == '.' else p) for p in gff_parts]
     assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id,
             gff_parts[0])
     # collect all of the base qualifiers for this item
     quals = collections.defaultdict(list)
     if gff_parts[1]:
         quals["source"].append(gff_parts[1])
     if gff_parts[5]:
         quals["score"].append(gff_parts[5])
     if gff_parts[7]:
         quals["phase"].append(gff_parts[7])
     # A missing attribute column ('.') means there are no attributes.
     for attribute in (gff_parts[8] or '').split(';'):
         if not attribute:
             # tolerate a trailing or doubled ';'
             continue
         # split once only, so values may themselves contain '='
         key, val = attribute.split('=', 1)
         quals[key].extend(val.split(','))
     quals = dict(quals)
     # if we are describing a location, then we are a feature
     if gff_parts[3] and gff_parts[4]:
         # GFF starts are 1-based; shift to a 0-based start
         location = FeatureLocation(int(gff_parts[3]) - 1, int(gff_parts[4]))
         new_feature = SeqFeature(location, gff_parts[2],
                 id = quals.get('ID', [''])[0],
                 strand = self._strand_map[gff_parts[6]])
         new_feature.qualifiers = quals
         # Handle flat features
         if not new_feature.id:
             rec.features.append(new_feature)
         # features that have parents need to link so we can pick up
         # the relationship
         elif 'Parent' in new_feature.qualifiers:
             for parent in new_feature.qualifiers['Parent']:
                 children[parent].append(new_feature)
         # top level features
         else:
             parents[rec.id].append(new_feature)
     # otherwise, associate these annotations with the full record
     else:
         # add these as a list of annotations, checking not to overwrite
         # current values
         for key, vals in quals.items():
             if key in rec.annotations:
                 try:
                     rec.annotations[key].extend(vals)
                 except AttributeError:
                     # existing value is a scalar: promote it to a list
                     rec.annotations[key] = [rec.annotations[key]] + vals
             else:
                 rec.annotations[key] = vals
     return rec, parents, children
Esempio n. 20
0
def nrpsSmash(dnaSeq):
    """Run ``specific_analysis`` on a DNA string wrapped in a one-CDS record.

    The sequence becomes a SeqRecord with a single plus-strand CDS feature
    covering its full length. After the analysis the folder named by
    ``options.raw_predictions_outputfolder`` is deleted; that attribute is
    not set here (presumably ``specific_analysis`` fills it in - verify).

    Returns whatever ``specific_analysis`` returns.
    """
    options = Namespace()
    options.outputfoldername = "/tmp/nrpspks_predictions_txt"
    # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    options.record_idx = ""
    options.eukaryotic = 0

    whole_sequence = FeatureLocation(0, len(dnaSeq))
    cds_feature = SeqFeature(whole_sequence, type="CDS", strand=1)
    cds_feature.qualifiers = {'gene':['gene']}

    record = SeqRecord(
        Seq(dnaSeq, IUPAC.unambiguous_dna),
        id="seq_id",
        name="seq_name",
        description="seq_description",
    )
    record.features = [cds_feature]

    analysis = specific_analysis(record, options)
    shutil.rmtree(options.raw_predictions_outputfolder)
    return analysis
Esempio n. 21
0
def check_sub(feature, sequence):
    """Recursively turn the CDS sub-features of a GFF feature into SeqFeatures.

    Sub-features with children of their own are descended into. Childless
    'CDS' sub-features are merged into one CDS feature (a compound location
    when there are several parts) that keeps only the qualifiers all parts
    agree on, never 'Parent', and gains a 'translation' qualifier computed
    from ``sequence.seq``.

    Returns the list of features built, which may be empty.
    """
    new_features = []
    parts = []
    merged_quals = {}
    mismatched = set()
    for sub in feature.sub_features:
        if sub.sub_features:
            # Not a leaf yet: recurse.
            new_features.extend(check_sub(sub, sequence))
            continue
        if sub.type != 'CDS':
            continue
        part_start = sub.location.start.real
        part_end = sub.location.end.real
        parts.append(FeatureLocation(part_start, part_end, strand=sub.strand))
        # A split CDS inherits a qualifier only if every part carries the
        # same value for it (e.g. one shared "protein_ID").
        for name in sub.qualifiers.keys():
            value = sub.qualifiers[name]
            seen = merged_quals.setdefault(name, value)
            if not seen == value:
                mismatched.add(name)

    for name in mismatched:
        merged_quals.pop(name, None)
    merged_quals.pop('Parent', None)

    # A feature is only built at the tip of the tree; if the recursion
    # already produced features they are returned as they are.
    if new_features or len(parts) == 0:
        return new_features

    if len(parts) > 1:
        parts = sorted(parts, key=lambda part: part.start.real)
        if parts[0].strand == 1:
            merged_location = CompoundLocation(parts)
        else:
            merged_location = CompoundLocation(list(reversed(parts)))
    else:
        merged_location = parts[0]

    cds = SeqFeature(merged_location)
    cds.qualifiers = merged_quals
    cds.type = 'CDS'
    protein = cds.extract(sequence.seq).translate(stop_symbol='')
    cds.qualifiers['translation'] = [str(protein)]
    new_features.append(cds)

    return new_features
Esempio n. 22
0
def nrpsSmash(dnaSeq):
    """Run ``specific_analysis`` on a DNA string wrapped in a one-CDS record.

    The sequence becomes a SeqRecord with a single plus-strand CDS feature
    covering its full length. After the analysis the folder named by
    ``options.raw_predictions_outputfolder`` is deleted; that attribute is
    not set here (presumably ``specific_analysis`` fills it in - verify).

    Returns whatever ``specific_analysis`` returns.
    """
    options = Namespace()
    options.outputfoldername = "/tmp/nrpspks_predictions_txt"
    # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    options.record_idx = ""
    options.eukaryotic = 0

    whole_sequence = FeatureLocation(0, len(dnaSeq))
    cds_feature = SeqFeature(whole_sequence, type="CDS", strand=1)
    cds_feature.qualifiers = {'gene': ['gene']}

    record = SeqRecord(
        Seq(dnaSeq, IUPAC.unambiguous_dna),
        id="sauce",
        name="bbqSauce",
        description="wtfDude",
    )
    record.features = [cds_feature]

    analysis = specific_analysis(record, options)
    shutil.rmtree(options.raw_predictions_outputfolder)
    return analysis
Esempio n. 23
0
def parse_smart_domains(file_name):
    """Parse KEY=VALUE domain records from a file into "Region" SeqFeatures.

    Records consist of DOMAIN, START, END, TYPE and STATUS lines and are
    closed by a line of at most one character (normally a blank line). A
    record yields a feature when its TYPE is not "PFAM" and its name is not
    'low_complexity_region'. A record that is not followed by such a
    separator line is not emitted.

    Args:
        file_name: Path of the text file to read.

    Returns:
        List of SeqFeature objects of type "Region" whose 'region_name'
        qualifier holds the domain name.
    """
    domains = []
    domain_status = False
    domain_type = False
    is_domain = False
    # Plain "r": the "U" flag was removed in Python 3.11, and universal
    # newlines are already the default for text mode on Python 3. The
    # context manager closes the handle even if parsing raises.
    with open(file_name, "r") as in_handle:
        for line in in_handle:
            line = line.rstrip()
            if len(line) > 1:
                pairs = line.split("=")
                is_domain = True
                if len(pairs) == 2:
                    key, value = pairs
                    if key == "DOMAIN":
                        domain_name = value
                    elif key == "START":
                        domain_start = int(value)
                    elif key == "END":
                        domain_end = int(value)
                    elif key == "TYPE":
                        domain_type = value != "PFAM"
                    elif key == "STATUS":
                        # Every status is accepted: the original filter on
                        # "visible|OK" was deliberately switched off.
                        domain_status = True
                    else:
                        is_domain = False
            else:
                # Separator line: emit the record collected so far.
                if is_domain and domain_type and domain_status:
                    d = SeqFeature(FeatureLocation(domain_start, domain_end), type="Region")
                    d.qualifiers = {'region_name': [domain_name]}
                    if domain_name != 'low_complexity_region':
                        domains.append(d)
                    is_domain = False
                    domain_type = False
                    domain_status = False
    return domains
Esempio n. 24
0
def store_promoters(promoters: Iterable[Promoter], record: Record) -> None:
    """Add a "promoter" feature to the record for every given promoter.

    Each feature carries the promoter's gene names as 'locus_tag' and its
    sequence as 'seq'; CombinedPromoter instances are additionally noted as
    bidirectional. The feature is converted with Feature.from_biopython and
    flagged as created by antiSMASH before being added.
    """
    for promoter in promoters:
        # promoter.start is shifted down by one for the 0-indexed location,
        # but never below zero
        zero_based_start = max(0, promoter.start - 1)
        span = FeatureLocation(zero_based_start, promoter.end)
        biopython_feature = SeqFeature(span, type="promoter")

        # get_gene_names() already returns a list with one or two elements
        qualifiers = {
            "locus_tag": promoter.get_gene_names(),
            "seq": [str(promoter.seq)],
        }
        biopython_feature.qualifiers = qualifiers
        if isinstance(promoter, CombinedPromoter):
            qualifiers["note"] = ["bidirectional promoter"]

        converted = Feature.from_biopython(biopython_feature)
        converted.created_by_antismash = True
        record.add_feature(converted)
Esempio n. 25
0
def add_cds_feature(
    transcript: TranscriptInterval,
    transcript_qualifiers: Dict[Hashable, List[Hashable]],
    strand: Strand,
    translation_table: TranslationTable,
    update_translations: bool,
) -> SeqFeature:
    """
    Converts a :class:`~biocantor.gene.transcript.TranscriptInterval` that has a CDS to a
    :class:`Bio.SeqFeature.SeqFeature` that represents the spliced CDS interval.

    The transcript qualifiers are copied (shallowly) onto the CDS feature, so adding the
    ``translation`` qualifier here does not leak back into the caller's dictionary.

    Args:
        transcript: A :class:`~biocantor.gene.transcript.TranscriptInterval`.
        transcript_qualifiers: Qualifiers dictionary from the transcript level feature.
        strand: ``Strand`` that this transcript lives on.
        translation_table: Translation table to use.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Returns:
        ``SeqFeature`` for the CDS of this transcript.
    """
    location = transcript.cds._location.to_biopython()
    feature = SeqFeature(location,
                         type=GeneIntervalFeatures.CDS.value,
                         strand=strand.value)
    # Copy rather than alias: the dict is mutated below, and sharing it would
    # also put the translation on whichever feature owns the original dict.
    feature.qualifiers = transcript_qualifiers.copy()

    if update_translations:
        # if the sequence has N's, we cannot translate; the feature is then
        # returned without a (re)computed translation
        try:
            feature.qualifiers["translation"] = [
                str(
                    transcript.get_protein_sequence(
                        translation_table=translation_table))
            ]
        except ValueError:
            pass

    return feature
Esempio n. 26
0
def parse_prodigal(infile, start_nr=1, name_base='prodigal_', prefix=''):
    """Parse a Prodigal coordinate file into CDS ``SeqFeature`` objects.

    The file content is split on ``'\\n#'`` and the resulting chunks are
    consumed in pairs: the first chunk of a pair must contain a ``seqhdr=``
    field naming the scaffold, the second holds one leading line (presumably
    the ``# Model Data`` line -- TODO confirm against the Prodigal format)
    followed by one gene line per prediction, read as
    ``>nr_start_end_strand``.

    Args:
        infile: Path of the file to read.
        start_nr: Number used in the first generated gene name; it is
            incremented once per gene.
        name_base: Text placed between ``prefix`` and the running number.
        prefix: Text placed in front of every generated gene name.

    Returns:
        A tuple ``(feature_dict, start_nr, all_names)`` where ``feature_dict``
        maps scaffold name -> list of CDS features, ``start_nr`` is the next
        unused number and ``all_names`` maps every generated gene name to the
        number of times it was produced.

    Raises:
        ValueError: If a header chunk has no ``seqhdr=`` field or a gene
            coordinate is not an integer.
    """
    feature_dict = {}
    all_names = {}
    with open(infile) as handle:
        chunks = handle.read().split('\n#')

    # Pair header chunk with data chunk. zip() ignores a trailing unpaired
    # chunk, so an empty file yields an empty result instead of an IndexError.
    for header, data in zip(chunks[0::2], chunks[1::2]):
        features = []
        name_start = header.index('seqhdr=')
        # Drop surrounding whitespace (e.g. '\r') before removing the quotes,
        # then keep only the first word of the sequence header.
        scaffold_name = header[name_start + 7:].strip().strip('"')
        scaffold_name = scaffold_name.split(' ')[0]

        # The first line of the data chunk is not a gene line and is skipped.
        for line in data.split('\n')[1:]:
            line = line.strip()
            if not line:
                # tolerate any number of blank lines, wherever they occur
                continue
            line_data = line[1:].split('_')
            start = int(line_data[1]) - 1  # To adjust to pythonic index
            end = int(line_data[2])
            strand = 1 if line_data[3] == '+' else -1

            gene_name = '%s%s%i' % (prefix, name_base, start_nr)
            all_names[gene_name] = all_names.get(gene_name, 0) + 1
            start_nr += 1

            # The strand lives on the location; passing it to SeqFeature as
            # well is redundant and rejected by newer Biopython releases.
            sf = SeqFeature(FeatureLocation(start, end, strand), type='CDS')
            sf.qualifiers = {'locus_tag': [gene_name]}
            features.append(sf)
        feature_dict[scaffold_name] = features

    return (feature_dict, start_nr, all_names)
Esempio n. 27
0
def merge_cds(LocList, TransLocList, QualList, sequence):
    """Combine exon locations into one CDS ``SeqFeature`` with a translation.

    Args:
        LocList: Locations making up the CDS. With more than one entry they
            are sorted by start and joined into a ``CompoundLocation``; the
            order is reversed unless the first sorted location has strand 1.
        TransLocList: Locations that are extracted from ``sequence.seq`` and
            translated piece by piece; sorted/reversed the same way as
            ``LocList`` when ``LocList`` has more than one entry.
        QualList: Qualifier mapping used directly (not copied) as the
            feature's qualifiers; it receives a ``'translation'`` entry.
        sequence: Object whose ``.seq`` the translated pieces are cut from.

    Returns:
        The new CDS feature, or ``None`` when ``LocList`` is empty.
    """
    if len(LocList) == 0:
        return None

    if len(LocList) > 1:
        LocList = sorted(LocList, key=lambda x: x.start.real)
        TransLocList = sorted(TransLocList, key=lambda x: x.start.real)
        if LocList[0].strand == 1:
            newLoc = CompoundLocation(LocList)
        else:
            newLoc = CompoundLocation(list(reversed(LocList)))
            TransLocList = list(reversed(TransLocList))
    else:
        newLoc = LocList[0]

    cur_feature = SeqFeature(newLoc)
    cur_feature.qualifiers = QualList
    cur_feature.type = 'CDS'
    # str() is the public way to get the residues; the private Seq._data
    # attribute holds bytes in current Biopython and cannot be joined as text.
    trans = ''.join(
        str(n.extract(sequence.seq).translate(stop_symbol=''))
        for n in TransLocList)
    cur_feature.qualifiers['translation'] = [trans]

    return cur_feature
Esempio n. 28
0
def pred_coil(seqr, params, fScore=None):
    """Return a deep copy of ``seqr`` with one "pscoils" feature per residue.

    Every window of ``params.win`` residues is scored as the weighted
    geometric mean of ``fScore(params.mat, seq, index, heptad_pos)`` using the
    weights ``params.pow[heptad_pos]``. Each residue keeps the best score of
    all windows covering it, together with the heptad letter ('a'..'g') it had
    in that window ('x' if no window covered it). ``coilProb`` then turns each
    score into the ``gg``, ``gcc`` and ``prob`` qualifiers.

    Args:
        seqr: Record-like object with ``.seq`` and ``.features``; it is not
            modified, a deep copy is annotated instead.
        params: Object providing ``win``, ``pow`` and ``mat`` (also handed to
            ``coilProb``).
        fScore: Optional scoring callable; defaults to ``seqScore``.

    Returns:
        The annotated copy of ``seqr``.
    """
    seqr = copy.deepcopy(seqr)
    if fScore is None:
        fScore = seqScore
    seq = seqr.seq
    seqLen = len(seq)
    hept_pos = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    score = [0.0] * seqLen
    hept_seq = ['x'] * seqLen

    for i in range(seqLen - params.win + 1):
        win_len = min(params.win, seqLen - i)
        this_score = 1.0
        actual_win = 0.0
        for j in range(win_len):
            pos = j % 7
            actual_win += params.pow[pos]
            this_score *= math.pow(fScore(params.mat, seq, i + j, pos),
                                   params.pow[pos])
        # normalise the product by the summed weights (geometric mean)
        if actual_win > 0:
            this_score = math.pow(this_score, 1 / actual_win)
        else:
            this_score = 0.0
        # every residue in the window remembers the best window it was part of
        for j in range(win_len):
            if this_score > score[i + j]:
                score[i + j] = this_score
                hept_seq[i + j] = hept_pos[j % 7]

    for i in range(seqLen):
        gg, gcc, prob = coilProb(score[i], params)
        seqf = SeqFeature(location=FeatureLocation(i, i), type="pscoils")
        seqf.qualifiers = {
            'gg': gg,
            'gcc': gcc,
            'prob': prob,
            'score': score[i],
            'hept_seq': hept_seq[i],
        }
        seqr.features.append(seqf)
    return seqr
Esempio n. 29
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Append gene, mRNA and CDS features for `gene` to record `r`.

    A 'gene' feature is only added for identifiers ending in '.t1'; the mRNA
    feature is always added. The CDS feature is added unless its translation
    is empty, contains an internal stop codon, or is partial while
    `partialyes` is false.

    Args:
        r: Record providing `.seq` and a `.features` list.
        gene: Key into `gene2position`; also used as locus_tag / gene name.
        gene2position: Maps gene -> (contig, CDSs, gffstrand, function,
            frames). The stored CDSs list is not modified.
        gene2product: Maps gene -> product; "hypothetical protein" if absent.
        start, end: Gene boundaries; `start - 1` is used as location start
            (presumably 1-based input -- TODO confirm with caller).
        gcode: Translation table handed to `translate` and stored as
            'transl_table'.
        partialyes: If false, partial CDS features are not added.
        verbose: If true, warnings are written to stderr.

    Returns:
        The record `r`.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # Work on a reversed copy: reversing in place would alter the list
        # kept in gene2position and flip it again on any later call.
        CDSs = list(reversed(CDSs))
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # strand belongs to the location; newer Biopython no longer accepts
        # it as a SeqFeature argument
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1),
                                        AfterPosition(end),
                                        strand=strand),
                        type='gene',
                        id=geneid)
        sf.qualifiers = {
            "locus_tag": geneid,
            "gene": geneid,
            "product": product
        }
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {
        "locus_tag": geneid,
        "gene": geneid,
        "product": product
    }
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #nothing left to annotate - would otherwise fail on aa[0] / aa[-1] below
    if not aa.strip("*"):
        if verbose:
            sys.stderr.write("[Warning] Empty translation: %s. Skipped!\n" %
                             gene)
        return r
    #solve non-triplets issue
    if len(seq) % 3:
        if strand == 1:
            end -= len(seq) % 3
        else:
            start += len(seq) % 3
    ##check for partial sequence - no M as first or no * as last aa
    # NOTE(review): the replacement locations below are single spans built
    # without a strand - confirm this is intended for minus-strand and
    # multi-exon genes.
    partial = 0
    #both ends partial
    if aa[0] != "M" and aa[-1] != "*":
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      AfterPosition(end))
    #left end partial
    elif aa[0] != "M" and strand == 1 or aa[-1] != "*" and strand == -1:
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), end)
    #right end partial
    elif aa[-1] != "*" and strand == 1 or aa[0] != "M" and strand == -1:
        partial = 1
        sf.location = FeatureLocation(start - 1, AfterPosition(end))
    #strip stop codon
    aa = aa.strip("*")
    #skip genes with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" %
                             gene)
        return r
    sf.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa
    }
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if aa[0] != "M":
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))
    #add to features
    r.features.append(sf)
    return r
Esempio n. 30
0
def gene_to_feature(
    gene_or_feature: Union[GeneInterval, FeatureIntervalCollection],
    genbank_type: GenbankFlavor,
    force_strand: bool,
    translation_table: TranslationTable,
    update_translations: bool,
) -> Iterable[SeqFeature]:
    """Converts either a :class:`~biocantor.gene.collections.GeneInterval` or a
    :class:`~biocantor.gene.collections.FeatureIntervalCollection` to a :class:`Bio.SeqFeature.SeqFeature`.

    :class:`Bio.SeqFeature.SeqFeature` are BioPython objects that will then be used to write to a GenBank file. There
    is one :class:`Bio.SeqFeature.SeqFeature` for every feature, or row group, in the output file. There will be one
    contiguous interval at the Gene level.

    While :class:`~biocantor.gene.collections.GeneInterval` always has its interval on the plus strand,
    GenBank files assume that a Gene has an explicit strand. Therefore, this function picks the most common strand
    among its children (the first one encountered wins a tie) and forces it on all of them.

    Every qualifier value on the top-level feature is a list, including ``locus_tag``, matching what
    :func:`transcripts_to_feature` produces for the child features.

    Args:
        gene_or_feature: A :class:`~biocantor.gene.collections.GeneInterval` or
            :class:`~biocantor.gene.collections.FeatureIntervalCollection`.
        genbank_type: Are we writing an prokaryotic or eukaryotic style GenBank file?
        force_strand: Boolean flag; if ``True``, then strand on children is forced, if ``False``, then improper
            strands are instead skipped.
        translation_table: Translation table to use.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Yields:
        ``SeqFeature``s, one for the gene, one for each child transcript, and one for each transcript's CDS if it
            exists.
    """
    location = gene_or_feature._location.to_biopython()
    # update the strand by picking the most common
    strands = [child.strand for child in gene_or_feature]
    strand = max(strands, key=strands.count)

    qualifiers = {
        key: list(vals)
        for key, vals in gene_or_feature.export_qualifiers().items()
    }

    # do our best to ensure there is a /gene tag
    symbol = None
    if isinstance(gene_or_feature, GeneInterval):
        if gene_or_feature.gene_symbol:
            symbol = gene_or_feature.gene_symbol
        elif gene_or_feature.gene_id:
            symbol = gene_or_feature.gene_id

        feature_type = GeneFeatures.GENE.value

    else:
        if gene_or_feature.feature_collection_name:
            symbol = gene_or_feature.feature_collection_name
        elif gene_or_feature.feature_collection_id:
            symbol = gene_or_feature.feature_collection_id

        feature_type = FeatureCollectionFeatures.FEATURE_COLLECTION.value

    if symbol:
        qualifiers[feature_type] = [symbol]
    if gene_or_feature.locus_tag:
        # wrapped in a list like every other qualifier value in this dictionary
        qualifiers[KnownQualifiers.LOCUS_TAG.value] = [
            gene_or_feature.locus_tag
        ]

    feature = SeqFeature(location, type=feature_type, strand=strand.value)
    feature.qualifiers = qualifiers

    yield feature

    if isinstance(gene_or_feature, GeneInterval):
        yield from transcripts_to_feature(
            gene_or_feature.transcripts,
            strand,
            genbank_type,
            force_strand,
            translation_table,
            symbol,
            gene_or_feature.locus_tag,
            update_translations,
        )
    else:
        yield from feature_intervals_to_features(
            gene_or_feature.feature_intervals, strand, force_strand, symbol,
            gene_or_feature.locus_tag)
Esempio n. 31
0
def transcripts_to_feature(
    transcripts: List[TranscriptInterval],
    strand: Strand,
    genbank_type: GenbankFlavor,
    force_strand: bool,
    translation_table: TranslationTable,
    gene_symbol: Optional[str] = None,
    locus_tag: Optional[str] = None,
    update_translations: bool = False,
) -> Iterable[SeqFeature]:
    """Yields the GenBank :class:`Bio.SeqFeature.SeqFeature` objects for the transcripts of one gene.

    Each transcript becomes one joined interval describing its exonic structure. GenBank does not allow the
    transcripts of a gene to sit on different strands, so a transcript whose strand differs from ``strand``
    triggers a :class:`StrandViolationWarning` and is then either forced onto the gene strand or skipped.

    Feature layout per transcript:

    * prokaryotic flavor, coding transcript: only the CDS feature is produced.
    * eukaryotic flavor, coding transcript: the transcript feature followed by its CDS feature.
    * any other transcript: a single transcript-level (biotype) feature.

    Args:
        transcripts: A list of :class:`~biocantor.gene.transcript.TranscriptInterval`.
        strand: ``Strand`` that this gene lives on.
        genbank_type: Are we writing an prokaryotic or eukaryotic style GenBank file?
        force_strand: If ``True`` mismatching transcripts are forced to ``strand``; if ``False`` they are skipped.
        translation_table: Translation table to use.
        gene_symbol: An optional gene symbol, stored as the gene qualifier.
        locus_tag: An optional locus tag, stored as the locus tag qualifier.
        update_translations: Should the /translation tag be calculated or re-calculated?
            This is a time consuming process.

    Yields:
        ``SeqFeature``s, one for each transcript and then one for each CDS of the transcript, if it exists.
    """
    for transcript in transcripts:
        location = transcript.chunk_relative_location.to_biopython()

        transcript_qualifiers = {}
        for key, vals in transcript.export_qualifiers().items():
            transcript_qualifiers[key] = list(vals)
        if gene_symbol is not None:
            transcript_qualifiers[KnownQualifiers.GENE.value] = [gene_symbol]
        if locus_tag is not None:
            transcript_qualifiers[KnownQualifiers.LOCUS_TAG.value] = [locus_tag]

        if location.strand != strand.value:
            warn_str = f"Found strand mismatch between gene and transcript on transcript {transcript}. "
            if not force_strand:
                warnings.warn(warn_str + "Skipping this transcript.", StrandViolationWarning)
                continue
            warnings.warn(warn_str + "Forcing this transcript to the gene orientation.", StrandViolationWarning)

        if transcript.transcript_type is not None and TranscriptFeatures.has_value(
                transcript.transcript_type.name):
            feat_type = TranscriptFeatures(transcript.transcript_type.name)
        elif transcript.is_coding:
            # the biotype could not be used, so rely on the presence of a CDS interval
            feat_type = TranscriptFeatures.CODING_TRANSCRIPT
        else:
            feat_type = TranscriptFeatures.MISC_RNA

        is_coding_transcript = feat_type == TranscriptFeatures.CODING_TRANSCRIPT

        if is_coding_transcript and genbank_type == GenbankFlavor.PROKARYOTIC:
            # prokaryotic coding genes have no transcript layer; emit the CDS directly
            yield add_cds_feature(transcript, transcript_qualifiers, strand,
                                  translation_table, update_translations)
            continue

        # an mRNA in eukaryotic mode, or a non-coding feature in either mode
        feature = SeqFeature(location,
                             type=feat_type.value,
                             strand=strand.value)
        feature.qualifiers = transcript_qualifiers.copy()
        # NCBI does not like protein_id on transcript level features
        feature.qualifiers.pop("protein_id", None)
        yield feature

        # the CDS is the third layer and exists only for eukaryotic coding genes
        if genbank_type == GenbankFlavor.EUKARYOTIC and is_coding_transcript:
            yield add_cds_feature(transcript, transcript_qualifiers,
                                  strand, translation_table,
                                  update_translations)
Esempio n. 32
0
                        right_remains_end=currentsearch[2]
                        
                        rightremains_list=[rightremainseq,right_remains_start,right_remains_end]
                    
                        remainsearches.append([rightremainseq,right_remains_start,right_remains_end])


        featureslist=[]
        for feat in gbhitslist:
	    #print feat
            mystrand=feat[3]["frame"][0]*feat[3]["frame"][1]
            
            feature = SeqFeature(FeatureLocation(feat[0]-1,feat[1]), strand=mystrand,type=feat[2])


            feature.qualifiers=feat[3]
            featureslist.append(feature)

	for f in featureslist:
		gbfile.features.append(f)

        outfilepath=contig_dir_path
        #print(outfilepath)
        tempfh=open(outfilepath,"w")
        SeqIO.write([gbfile],tempfh,"genbank")
    #print "treated "+str(contigcounter)+ " contigs"


print "treated "+str(genomerowcounter)+ "genome rows"

Esempio n. 33
0
        if r0063 is not None:
            luxpl = next(get_features('Lux pL promoter'))
            luxpl.location = r0063.location
            gb_archive.features.remove(r0063)

        # add LVA ssrA tag
        ssra_match = SSRA_TAG.search(gb_archive.seq)
        if ssra_match is not None:
            ssra = SeqFeature(type="CDS")
            ssra.location = FeatureLocation(*ssra_match.span(), strand=1)
            ssra.qualifiers = {
                "label": ["ssrA tag (LVA)"],
                "product": [
                    "C-terminal peptide that mediates degradation in bacteria through the ClpXP and ClpAP proteases (McGinness et al., 2006)"
                ],
                "translation":
                "AANDENYALVA",
                "note": [
                    "mutant LVA variant that confers accelerated degradation under some conditions (Andersen et al., 1998)",
                    "color: #cc99b2",
                ],
            }
            gb_archive.features.append(ssra)

        # Replace E0040m with well annotated GFP
        e0040m = next(get_features("E0040m"), None)
        if e0040m is not None:
            if any(get_features("GFP")):
                gb_archive.features.remove(next(get_features("GFP")))
            e0040m.qualifiers.update(gfp.qualifiers)

        # Replace E1010m with well annotated mRFP
Esempio n. 34
0
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append gene, mRNA and CDS features for `gene` to record `r`.

    A 'gene' feature is only added for identifiers ending in '.t1'; the mRNA
    feature is always added. The CDS feature is added unless its translation
    is empty, contains an internal stop codon, or is partial while
    `partialyes` is false.

    Args:
        r: Record providing `.seq` and a `.features` list.
        gene: Key into `gene2position`; also used as locus_tag / gene name.
        gene2position: Maps gene -> (contig, CDSs, gffstrand, function,
            frames). The stored CDSs list is not modified.
        gene2product: Maps gene -> product; "hypothetical protein" if absent.
        start, end: Gene boundaries; `start-1` is used as location start
            (presumably 1-based input -- TODO confirm with caller).
        gcode: Translation table handed to `translate` and stored as
            'transl_table'.
        partialyes: If false, partial CDS features are not added.
        verbose: If true, warnings are written to stderr.

    Returns:
        The record `r`.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1','+'):
        strand = +1
    else:
        strand = -1
        # Work on a reversed copy: reversing in place would alter the list
        # kept in gene2position and flip it again on any later call.
        CDSs = list(reversed(CDSs))
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # strand belongs to the location; newer Biopython no longer accepts
        # it as a SeqFeature argument
        sf = SeqFeature(FeatureLocation(BeforePosition(start-1), AfterPosition(end), strand=strand), type='gene', id=geneid)
        sf.qualifiers={"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers={"locus_tag": geneid, "gene": geneid, "product": product}
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #nothing left to annotate - would otherwise fail on aa[0] / aa[-1] below
    if not aa.strip("*"):
        if verbose:
            sys.stderr.write("[Warning] Empty translation: %s. Skipped!\n" % gene)
        return r
    #solve non-triplets issue
    if len(seq) % 3:
        if strand==1:
            end   -= len(seq) % 3
        else:
            start += len(seq) % 3
    ##check for partial sequence - no M as first or no * as last aa
    # NOTE(review): the replacement locations below are single spans built
    # without a strand - confirm this is intended for minus-strand and
    # multi-exon genes.
    partial = 0
    #both ends partial
    if aa[0]!="M" and aa[-1]!="*":
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start-1),AfterPosition(end))
    #left end partial
    elif aa[0]!="M" and strand==1 or aa[-1]!="*" and strand==-1:
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start-1),end)
    #right end partial
    elif aa[-1]!="*" and strand==1 or aa[0]!="M" and strand==-1:
        partial = 1
        sf.location = FeatureLocation(start-1,AfterPosition(end))
    #strip stop codon
    aa = aa.strip("*")
    #skip genes with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid, "product": product, "translation": aa}
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if not partial are allowed
        if not partialyes:
            return r
        if aa[0]!="M":
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product']    += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))
    #add to features
    r.features.append(sf)
    return r
Esempio n. 35
0
def annotate_geneclusters(seq_record, options):
    """Fold ClusterFinder predictions into the cluster features of seq_record.

    A ClusterFinder hit that overlaps an existing cluster either yields a
    "cluster_border" feature (when options.borderpredict is set and the
    midpoint of the existing cluster lies inside the hit) or widens the
    existing cluster so that it covers the hit; the existing cluster also
    receives the hit's probability. A hit that overlaps nothing becomes a new
    "cluster" feature, unless options.borderpredict_only is set. If any new
    cluster was added, the clusters are renumbered.
    """
    pfam_features = utils.get_pfam_features(seq_record)
    cf_clusters = find_cf_clusters(pfam_features, seq_record, options)
    newclusters = []
    cluster_features = utils.get_cluster_features(seq_record)
    secmet_cds_features = utils.get_secmet_cds_features(seq_record)

    for cf_hit in cf_clusters:
        cf_type = "cf_putative"
        overlaps = False

        for cluster in cluster_features:
            if not utils.features_overlap(cf_hit, cluster):
                continue
            overlaps = True

            # signature genes of the already annotated cluster
            cds_in_cluster = utils.get_cluster_cds_features(cluster, seq_record)
            signature_genes = [
                cds for cds in secmet_cds_features if cds in cds_in_cluster
            ]

            if options.borderpredict:
                midpoint = (cluster.location.end + cluster.location.start) / 2
                if midpoint in cf_hit.location:
                    # stretch the hit so that no signature gene is cut off
                    for sig_gene in signature_genes:
                        lower = min(sig_gene.location.start,
                                    sig_gene.location.end)
                        upper = max(sig_gene.location.start,
                                    sig_gene.location.end)
                        if cf_hit.location.start > lower:
                            cf_hit.location = FeatureLocation(
                                lower, cf_hit.location.end)
                        if cf_hit.location.end < upper:
                            cf_hit.location = FeatureLocation(
                                cf_hit.location.start, upper)
                    border = SeqFeature(cf_hit.location, type="cluster_border")
                    border.qualifiers = {
                        "tool": ["clusterfinder"],
                        "probability": [cf_hit.probability],
                        "note": ["best prediction"],
                    }
                    seq_record.features.append(border)
            else:
                # widen the existing cluster on whichever side the hit sticks out
                starts_before = cf_hit.location.start < cluster.location.start
                ends_after = cf_hit.location.end > cluster.location.end
                if starts_before and ends_after:
                    cluster.location = cf_hit.location
                elif starts_before:
                    cluster.location = FeatureLocation(
                        cf_hit.location.start, cluster.location.end)
                elif ends_after:
                    cluster.location = FeatureLocation(
                        cluster.location.start, cf_hit.location.end)

            cluster.qualifiers['probability'] = [
                "%01.4f" % cf_hit.probability
            ]

        if overlaps or ('borderpredict_only' in options
                        and options.borderpredict_only):
            continue

        # derive the product type from the "Type: " notes of the member genes
        for cds in utils.get_cluster_cds_features(cf_hit, seq_record):
            if 'sec_met' not in cds.qualifiers:
                continue
            for qualifier in cds.qualifiers['sec_met']:
                if "Type: " not in qualifier:
                    continue
                if "cf_fatty_acid" in qualifier:
                    if cf_type == "cf_putative":
                        cf_type = "cf_fatty_acid"
                    elif cf_type == "cf_saccharide":
                        cf_type = "cf_fatty_acid-saccharide"
                if "cf_saccharide" in qualifier:
                    if cf_type == "cf_putative":
                        cf_type = "cf_saccharide"
                    elif cf_type == "cf_fatty_acid":
                        cf_type = "cf_fatty_acid-saccharide"

        new_cluster = SeqFeature(cf_hit.location, type="cluster")
        new_cluster.qualifiers['product'] = [cf_type]
        new_cluster.qualifiers['probability'] = ["%01.4f" % cf_hit.probability]
        newclusters.append(new_cluster)

    if newclusters:
        seq_record.features.extend(newclusters)
        renumber_clusters(seq_record, options)
Esempio n. 36
0
def name_nrpspks(seq_record, pksnrpsvars, withinclustergenes, options):
    """Classify NRPS/PKS genes and attach subtype, motif and domain annotations.

    Resets pksnrpsvars.nrpspkstypedict and then, for every feature in
    withinclustergenes that has a non-empty entry in pksnrpsvars.domaindict:

    * derives a subtype string from the domain names (and, for PKS, from the
      KS hits in pksnrpsvars.ksdomaindict) and appends it to the feature's
      'sec_met' qualifier,
    * builds one motif feature (type options.FeatureTags.pksnrpsmotifs_tag)
      per entry in pksnrpsvars.motifdict for that gene and adds them to
      seq_record.features,
    * appends one "NRPS/PKS Domain" line per domain to 'sec_met',
    * stores the subtype in pksnrpsvars.nrpspkstypedict under the gene id.

    Uses dict.has_key, i.e. this code targets Python 2.
    """
    pksnrpsvars.nrpspkstypedict = {}
    for feature in withinclustergenes:
        k = utils.get_gene_id(feature)
        # genes without any detected domain are not classified
        if not pksnrpsvars.domaindict.has_key(k):
            continue
        if pksnrpsvars.domaindict[k] == []:
            continue
        #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
        domainlist = []
        nrKSdomains = 0
        for i in pksnrpsvars.domaindict[k]:
            domainlist.append(i[0])
            if i[0] == "PKS_KS":
                nrKSdomains += 1
        # count the KS hits of each KS subtype for this gene
        modKSscore = 0
        traKSscore = 0
        eneKSscore = 0
        iterKSscore = 0
        if pksnrpsvars.ksdomaindict.has_key(k):
            for i in pksnrpsvars.ksdomaindict[k]:
                if i[0] == "Trans-AT-KS":
                    traKSscore += 1
                if i[0] == "Modular-KS":
                    modKSscore += 1
                if i[0] == "Enediyne-KS":
                    eneKSscore += 1
                if i[0] == "Iterative-KS":
                    iterKSscore += 1
        if pksnrpsvars.domaindict.has_key(k):
            # The loop variable is not used in the body: the same
            # classification is recomputed once per domain. The first matching
            # branch of the chain decides the subtype, so branch order matters.
            for i in pksnrpsvars.domaindict[k]:
                if "Cglyc" in domainlist and "Epimerization" in domainlist and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "Glycopeptide NRPS"
                elif (
                        "Condensation_LCL" in domainlist or "Condensation_DCL"
                        in domainlist or "Condensation_Starter" in domainlist
                        or "Cglyc" in domainlist
                        or "Condensation_Dual" in domainlist
                ) and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "NRPS"
                # NOTE(review): `and` binds tighter than `or`, so this branch is
                # taken for any condensation domain even without a PKS domain -
                # confirm that this is intended.
                elif ("Condensation_LCL" in domainlist or "Condensation_DCL"
                      in domainlist or "Condensation_Starter" in domainlist
                      or "Cglyc" in domainlist or "Condensation_Dual"
                      in domainlist) or "AMP-binding" in domainlist and (
                          "PKS_KS" in domainlist or "PKS_AT" in domainlist):
                    nrpspkstype = "Hybrid PKS-NRPS"
                elif (
                        "Condensation_LCL" not in domainlist
                        and "Condensation_DCL" not in domainlist
                        and "Condensation_Starter" not in domainlist
                        and "Cglyc" not in domainlist and "Condensation_Dual"
                        not in domainlist and "AMP-binding" not in domainlist
                ) and "PKS_KS" in domainlist and "PKS_AT" not in domainlist and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore:
                    nrpspkstype = "Type I Trans-AT PKS"
                elif (
                        "Condensation_LCL" not in domainlist
                        and "Condensation_DCL" not in domainlist
                        and "Condensation_Starter" not in domainlist
                        and "Cglyc" not in domainlist and "Condensation_Dual"
                        not in domainlist and "AMP-binding" not in domainlist
                ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3:
                    nrpspkstype = "Type I Iterative PKS"
                elif (
                        "Condensation_LCL" not in domainlist
                        and "Condensation_DCL" not in domainlist
                        and "Condensation_Starter" not in domainlist
                        and "Cglyc" not in domainlist and "Condensation_Dual"
                        not in domainlist and "AMP-binding" not in domainlist
                ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3:
                    nrpspkstype = "Type I Enediyne PKS"
                elif (
                        "Condensation_LCL" not in domainlist
                        and "Condensation_DCL" not in domainlist
                        and "Condensation_Starter" not in domainlist
                        and "Cglyc" not in domainlist and "Condensation_Dual"
                        not in domainlist and "AMP-binding" not in domainlist
                ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and (
                    (modKSscore > eneKSscore and modKSscore > traKSscore
                     and modKSscore > iterKSscore) or nrKSdomains > 3):
                    nrpspkstype = "Type I Modular PKS"
                elif ("Condensation_LCL" not in domainlist
                      and "Condensation_DCL" not in domainlist
                      and "Condensation_Starter" not in domainlist
                      and "Cglyc" not in domainlist and "Condensation_Dual"
                      not in domainlist and "AMP-binding" not in domainlist
                      ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist:
                    nrpspkstype = "PKS-like protein"
                elif (
                        "Condensation_LCL" in domainlist or "Condensation_DCL"
                        in domainlist or "Condensation_Starter" in domainlist
                        or "Cglyc" in domainlist or "Condensation_Dual"
                        in domainlist or "AMP-binding" in domainlist
                ) and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "NRPS-like protein"
                else:
                    nrpspkstype = "PKS/NRPS-like protein"
            # record the subtype on the gene itself
            if feature.qualifiers.has_key("sec_met"):
                feature.qualifiers['sec_met'].append("NRPS/PKS subtype: " +
                                                     nrpspkstype)
            else:
                feature.qualifiers['sec_met'] = [
                    "NRPS/PKS subtype: " + nrpspkstype
                ]

            #Write motifs to seq_record
            motifFeatures = []
            if pksnrpsvars.motifdict.has_key(k):
                motifs = pksnrpsvars.motifdict[k]
                counter = 1
                for motif in motifs:
                    # motif[1] / motif[2] are scaled by 3 and offset from the
                    # gene start (or gene end on the other strand) - presumably
                    # protein coordinates turned into nucleotide coordinates;
                    # TODO confirm
                    if feature.location.strand == 1:
                        start = feature.location.start + (3 * motif[1])
                        end = feature.location.start + (3 * motif[2])
                    else:
                        end = feature.location.end - (3 * motif[1])
                        start = feature.location.end - (3 * motif[2])
                    loc = FeatureLocation(start, end, strand=feature.strand)
                    motifFeature = SeqFeature(
                        loc, type=options.FeatureTags.pksnrpsmotifs_tag)
                    quals = defaultdict(list)

                    quals['label'].append(str(motif[0]))
                    # fall back to the gene id when the gene has no locus_tag
                    if feature.qualifiers.has_key('locus_tag'):
                        quals['locus_tag'] = feature.qualifiers['locus_tag']
                    else:
                        quals['locus_tag'] = [k]
                    quals['motif'] = [motif[0]]
                    quals['asDomain_id'] = [
                        'nrpspksmotif_' + '_'.join(quals['locus_tag']) + '_' +
                        '{:04d}'.format(counter)
                    ]
                    counter += 1

                    quals['evalue'] = [str("{:.2E}".format(float(motif[3])))]
                    quals['score'] = [str(motif[4])]
                    quals['aSTool'] = ["pksnrpsmotif"]
                    quals['detection'] = ["hmmscan"]
                    quals['database'] = ["abmotifs"]
                    # translation table 1 is used when the gene does not name one
                    if feature.qualifiers.has_key('transl_table'):
                        [transl_table] = feature.qualifiers['transl_table']
                    else:
                        transl_table = 1
                    quals['translation'] = [
                        str(
                            motifFeature.extract(seq_record).seq.translate(
                                table=transl_table))
                    ]

                    quals['note'].append("NRPS/PKS Motif: " + motif[0] +
                                         " (e-value: " + str(motif[3]) +
                                         ", bit-score: " + str(motif[4]) + ")")

                    motifFeature.qualifiers = quals

                    motifFeatures.append(motifFeature)
            nrpspksdomains = pksnrpsvars.domaindict[k]

            # one 'sec_met' line per detected domain
            for domain in nrpspksdomains:
                if feature.qualifiers.has_key("sec_met"):
                    feature.qualifiers['sec_met'].append(
                        "NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;"
                        % (domain[0], str(domain[1]), str(
                            domain[2]), str(domain[3]), str(domain[4])))
                else:
                    feature.qualifiers['sec_met'] = [
                        "NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;"
                        % (domain[0], str(domain[1]), str(
                            domain[2]), str(domain[3]), str(domain[4]))
                    ]

        # motifFeatures and nrpspkstype are assigned inside the `if` above;
        # that `if` repeats the membership test already passed at the top of
        # the loop body.
        seq_record.features.extend(motifFeatures)
        pksnrpsvars.nrpspkstypedict[k] = nrpspkstype
Esempio n. 37
0
def _annotate(seq_record, options, results):
    """Annotate seq_record with one new feature per accepted PFAM hit.

    Every HSP in ``results`` whose bitscore is above the minimum score and
    whose e-value is below the maximum e-value (both derived from
    ``options``) and whose ``query_id`` is present in the feature dict of
    ``seq_record`` yields a SeqFeature of type
    ``options.FeatureTags.fullhmmer_tag`` that is appended to
    ``seq_record.features``.

    Args:
        seq_record: record whose features are looked up by id and which
            receives the newly created features.
        options: configuration object; supplies the score / e-value
            thresholds and ``FeatureTags.fullhmmer_tag``.
        results: iterable of search results, each exposing ``id``,
            ``target`` and ``hsps``.

    Returns:
        None. ``seq_record`` is modified in place.
    """
    logging.debug("generating feature objects for PFAM hits")
    min_score = _min_score(options)
    max_evalue = _max_evalue(options)

    feature_by_id = utils.get_feature_dict(seq_record)

    for r in results:
        # Running number used in asDomain_id; restarts for every result.
        i = 1
        for hsp in r.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            # ``in`` instead of dict.has_key(), which no longer exists in
            # Python 3.
            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = _calculate_start_end(feature, hsp)
            loc = FeatureLocation(start, end, strand=feature.strand)

            newFeature = SeqFeature(
                location=loc, type=options.FeatureTags.fullhmmer_tag)

            quals = defaultdict(list)

            quals['label'].append(r.id)
            if 'locus_tag' in feature.qualifiers:
                quals['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                quals['locus_tag'] = [hsp.query_id]
            quals['domain'] = [hsp.hit_id]
            quals['asDomain_id'] = [
                'fullhmmer_' + '_'.join(quals['locus_tag']) + '_' +
                '{:04d}'.format(i)
            ]
            i += 1

            quals['evalue'] = ["{:.2E}".format(float(hsp.evalue))]
            quals['score'] = [str(hsp.bitscore)]
            quals['aSTool'] = ["fullhmmer"]
            quals['detection'] = ["hmmscan"]
            quals['database'] = [path.basename(r.target)]

            # Translate with the parent feature's table when it has one,
            # otherwise fall back to table 1.
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            quals['translation'] = [
                str(newFeature.extract(seq_record.seq).translate(
                    table=transl_table))
            ]

            quals['note'].append(
                "%s-Hit: %s. Score: %s. E-value: %s. Domain range: %s..%s." %
                (path.basename(r.target), hsp.hit_id, hsp.bitscore,
                 hsp.evalue, hsp.hit_start, hsp.hit_end))

            quals['description'] = [hsp.hit_description]

            # Hits whose name has no entry in name_to_pfamid simply get no
            # db_xref; only the lookup itself is guarded.
            try:
                pfamid = name_to_pfamid[hsp.hit_id]
            except KeyError:
                pass
            else:
                # quals is a defaultdict(list), so the list is created on
                # first access.
                quals['db_xref'].append("PFAM: %s" % pfamid)

            newFeature.qualifiers = quals
            seq_record.features.append(newFeature)
Esempio n. 38
0
def agrupar_sitios(
        regions_path="/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg",
        output_path="/data/organismos/ILEX_PARA2/regulation/grouped.gff",
        max_gap=1500):
    """Group neighbouring features of each record, per strand, into GFF.

    The records parsed from ``regions_path`` are processed one strand at a
    time. Features of a strand are visited in order of start position; a
    feature joins the current group when it starts less than ``max_gap``
    positions from the furthest end of the group or overlaps the feature
    added last. Otherwise the current group is closed and a new one is
    started. Every closed group becomes a feature of type
    ``grouped_transcription_regulatory_region`` spanning all its members,
    and the groups of each record are written to ``output_path``.

    Args:
        regions_path: GFF file to read the input records from.
        output_path: GFF file the grouped records are written to.
        max_gap: maximum distance between the end of the current group and
            the start of the next feature for the feature to be merged.

    Returns:
        None. The result is written to ``output_path``; the ID of every
        group longer than 5000 positions is printed.
    """
    group_type = "grouped_transcription_regulatory_region"
    long_group_length = 5000

    def _open_group(record, first_member):
        # Start a group holding a single member. The location is only a
        # placeholder: it is recomputed from all members on closing.
        # NOTE(review): ``id`` receives the record's first feature object,
        # not a string -- kept as found, confirm that this is intended.
        group = SeqFeature(id=record.features[0],
                           type=group_type,
                           location=first_member.location)
        group.sub_features = [first_member]
        return group

    def _close_group(group, strand, suffix, number):
        # Give the group its qualifiers and a location spanning all members.
        members = group.sub_features
        names = {
            m.qualifiers["description"][0].split(suffix)[0] for m in members
        }
        group.qualifiers = {
            "description": "_".join(sorted(names)),
            "ID": ["ILEXPARARR" + str(number)]
        }
        group.location = FeatureLocation(
            start=min(m.location.start for m in members),
            end=max(m.location.end for m in members),
            strand=strand)
        assert group.location.start < group.location.end
        if (group.location.end - group.location.start) > long_group_length:
            print(group.qualifiers["ID"])
        return group

    regiones = list(GFF.parse(regions_path))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            fs = sorted([f for f in c.features if f.strand == strand],
                        key=lambda x: x.location.start)
            # Nothing on this strand (this also covers records without any
            # feature, which must not be indexed).
            if not fs:
                continue

            group = _open_group(c, fs[0])
            for f in fs[1:]:
                last = group.sub_features[-1]
                end = max(x.location.end for x in group.sub_features)
                is_near = abs(f.location.start - end) < max_gap
                # Two half-open intervals share a position exactly when the
                # larger start lies before the smaller end.
                overlaps_last = (
                    max(f.location.start, last.location.start) <
                    min(f.location.end, last.location.end))
                if is_near or overlaps_last:
                    group.sub_features.append(f)
                    continue

                # NOTE(review): groups closed here strip " regulatory region"
                # from member descriptions, while the last group of a strand
                # (below) strips " binding site" -- preserved as found,
                # confirm which suffix is intended.
                groups[c.id].append(
                    _close_group(group, strand, " regulatory region", ids))
                ids += 1
                group = _open_group(c, f)

            # Close the group still open at the end of the strand. The strand
            # comes from the loop, not from the last visited feature, which
            # does not exist when the strand holds a single feature.
            groups[c.id].append(
                _close_group(group, strand, " binding site", ids))
            ids += 1

    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    # Context manager so the output file is flushed and closed.
    with open(output_path, "w") as out_handle:
        GFF.write(tqdm(records), out_handle)