def annotate_sequence(seqrecord, features=['gene', 'RNA structure', 'other']):
    '''Annotate a consensus with genes and other known regions (in place).

    Parameters:
        seqrecord: record to annotate. New SeqFeature objects are appended
            to its ``features`` list. NOTE(review): presumably a Biopython
            SeqRecord -- confirm against callers.
        features: feature types to annotate. Each entry must be one of
            'gene', 'RNA structure', 'PCR primers' or 'other', otherwise a
            KeyError is raised. The default list is only read, never
            modified.

    Returns:
        None. A region whose name equals the id of a feature that is
        already in the record is skipped.
    '''
    import numpy as np
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple
    from hivwholeseq.data.primers import primers_PCR as primers_PCR_edges
    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'PCR primers': primers_PCR_edges,
                 'other': other_edges}

    smat = np.array(seqrecord)
    seqlen = len(smat)

    # Ids already in the record. The set is kept up to date as features are
    # appended, so the check below does not rescan the whole feature list
    # for every candidate region.
    present_ids = {fea.id for fea in seqrecord.features}

    for feature_type in features:
        # .items() works on both Python 2 and 3 (.iteritems() is 2 only)
        for name, edges in edge_dict[feature_type].items():
            # Skip a feature if it's present already
            if name in present_ids:
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6: search on the reversed sequence with
                # the edge motifs reversed and swapped, then map the hits
                # back to forward coordinates.
                # NOTE(review): whether ``seqlen - 1 - pos`` is the right
                # mapping depends on the end convention used by
                # find_region_edges -- confirm.
                if 'F6' in name:
                    rev_edge = find_region_edges(
                        smat[::-1], [edges[1][::-1], edges[0][::-1]])
                    pos_edge = [seqlen - 1 - rev_edge[1],
                                seqlen - 1 - rev_edge[0]]
                else:
                    pos_edge = find_region_edges(smat, edges)
                location = FeatureLocation(*pos_edge)
            else:
                # Split region: one location per piece, joined together
                pos_edges = find_region_edges_multiple(smat, edges)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            seqrecord.features.append(
                SeqFeature(location, type=feature_type, id=name, strand=1))
            present_ids.add(name)
Ejemplo n.º 2
0
def annotate_sequence(seqrecord, features=['gene', 'RNA structure', 'other']):
    '''Annotate a consensus with genes and other known regions (in place).

    Parameters:
        seqrecord: record to annotate. New SeqFeature objects are appended
            to its ``features`` list. NOTE(review): presumably a Biopython
            SeqRecord -- confirm against callers.
        features: feature types to annotate. Each entry must be one of
            'gene', 'RNA structure', 'PCR primers' or 'other', otherwise a
            KeyError is raised. The default list is only read, never
            modified.

    Returns:
        None. A region whose name equals the id of a feature that is
        already in the record is skipped.
    '''
    import numpy as np
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple
    from hivwholeseq.data.primers import primers_PCR as primers_PCR_edges
    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'PCR primers': primers_PCR_edges,
        'other': other_edges
    }

    smat = np.array(seqrecord)
    seqlen = len(smat)

    # Ids already in the record. The set is kept up to date as features are
    # appended, so the check below does not rescan the whole feature list
    # for every candidate region.
    present_ids = {fea.id for fea in seqrecord.features}

    for feature_type in features:
        # .items() works on both Python 2 and 3 (.iteritems() is 2 only)
        for name, edges in edge_dict[feature_type].items():
            # Skip a feature if it's present already
            if name in present_ids:
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6: search on the reversed sequence with
                # the edge motifs reversed and swapped, then map the hits
                # back to forward coordinates.
                # NOTE(review): whether ``seqlen - 1 - pos`` is the right
                # mapping depends on the end convention used by
                # find_region_edges -- confirm.
                if 'F6' in name:
                    rev_edge = find_region_edges(
                        smat[::-1], [edges[1][::-1], edges[0][::-1]])
                    pos_edge = [
                        seqlen - 1 - rev_edge[1],
                        seqlen - 1 - rev_edge[0]
                    ]
                else:
                    pos_edge = find_region_edges(smat, edges)
                location = FeatureLocation(*pos_edge)
            else:
                # Split region: one location per piece, joined together
                pos_edges = find_region_edges_multiple(smat, edges)
                location = CompoundLocation([
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ])

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with genes, proteins and other regions (in place).

    Two passes are made. First, regions defined by edge motifs (genes, RNA
    structures, 'other', plus anything passed in additional_edges) are
    located in the sequence itself. Then proteins, and chunks if requested,
    are located by aligning the matching features of the HXB2 reference
    onto the sequence.

    Parameters:
        seqrecord: record to annotate. New SeqFeature objects are appended
            to its ``features`` list. NOTE(review): presumably a Biopython
            SeqRecord -- confirm against callers.
        additional_edges: dict of feature type -> {name: edges}, merged over
            the built-in edge tables (equal keys replace the built-in
            entry). Only read, never modified.
        additional_features: list or tuple of extra feature groups to take
            from the reference. Only 'chunk' is recognised here; 'protein'
            is always included. Only read, never modified.
        VERBOSE: 0 prints warnings only, >= 1 also prints the feature
            types, >= 2 also prints progress for every feature.

    Returns:
        None.

    Raises:
        ValueError: if a protein/chunk name has no feature with that id in
            the HXB2 reference.
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    import sys
    import numpy as np
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + list(additional_features)
    features = list(edge_dict) + additional_features

    # NOTE: every print below takes a single pre-formatted string, so it
    # gives the same output under Python 2 and Python 3.
    if VERBOSE:
        print('Features: ' + ', '.join(features))

    smat = np.array(seqrecord)

    # Ids already in the record, kept up to date as features are appended so
    # the "already present" check does not rescan the feature list each time.
    present_ids = {fea.id for fea in seqrecord.features}

    for feature_type, edges_all in edge_dict.items():
        if VERBOSE >= 2:
            print('{} {}'.format(feature_type, list(edges_all)))

        for name, edges in edges_all.items():
            if VERBOSE >= 2:
                # No newline: the outcome is printed on the same line
                sys.stdout.write('{} '.format(name))

            # Skip a feature if it's present already
            if name in present_ids:
                if VERBOSE >= 2:
                    print('already present.')
                continue

            # NOTE(review): the built-in key is 'gene', so the two
            # ``feature_type == 'genes'`` branches below only run if
            # additional_edges supplies a 'genes' entry. Possibly a typo;
            # left as is because enabling locate_gene would change every
            # gene annotation.

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6: look for the start motif only, and
                # only from position 6000 onwards; the end stays None and
                # falls back to the end of the sequence below.
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000:], [edges[0], None])
                    # A start that was not found stays None so the fallback
                    # below handles it instead of failing on None + int.
                    if pos_edge[0] is not None:
                        pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                if None not in pos_edge:
                    # Cut the primers for some features
                    if name in ('V1', 'V3', 'V4', 'V5'):
                        pos_edge[0] += len(edges[0])
                        pos_edge[1] -= len(edges[1])
                    # Cut only the right primer for V2
                    elif name == 'V2':
                        pos_edge[1] -= len(edges[1])

                # Edges that were not found default to the sequence ends;
                # this is expected (no warning) for the outermost regions.
                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print('WARNING: start not found')
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print('WARNING: end not found')
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name+suff, output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            if VERBOSE >= 2:
                print('found: {}'.format(location))

            seqrecord.features.append(
                SeqFeature(location, type=feature_type, id=name, strand=1))
            present_ids.add(name)

    # Add proteins and other features from HXB2
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    # Built once; .index() below raises ValueError for an unknown name
    ref_ids = [fea.id for fea in ref_ann.features]
    for feagroup, feanames in additional_features_dict.items():
        for feaname in feanames:
            if VERBOSE >= 2:
                sys.stdout.write('{} '.format(feaname))

            fea = ref_ann.features[ref_ids.index(feaname)]
            seq = fea.extract(ref_ann)
            # NOTE(review): ali1/ali2 are presumably the aligned seqrecord
            # and the aligned reference feature, in argument order -- confirm
            # against the seqanpy documentation.
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            # The feature spans the part of the alignment where ali2 has no
            # leading/trailing gaps; gaps of ali1 inside that span are
            # removed from the end coordinate.
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start: end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print('found: {}'.format(location))

            seqrecord.features.append(
                SeqFeature(location, type=feagroup, id=feaname, strand=1))
Ejemplo n.º 4
0
def annotate_sequence(seqrecord,
                      additional_edges={},
                      additional_features=['chunk'],
                      VERBOSE=0):
    '''Annotate a consensus with genes, proteins and other regions (in place).

    Two passes are made. First, regions defined by edge motifs (genes, RNA
    structures, 'other', plus anything passed in additional_edges) are
    located in the sequence itself. Then proteins, and chunks if requested,
    are located by aligning the matching features of the HXB2 reference
    onto the sequence.

    Parameters:
        seqrecord: record to annotate. New SeqFeature objects are appended
            to its ``features`` list. NOTE(review): presumably a Biopython
            SeqRecord -- confirm against callers.
        additional_edges: dict of feature type -> {name: edges}, merged over
            the built-in edge tables (equal keys replace the built-in
            entry). Only read, never modified.
        additional_features: list or tuple of extra feature groups to take
            from the reference. Only 'chunk' is recognised here; 'protein'
            is always included. Only read, never modified.
        VERBOSE: 0 prints warnings only, >= 1 also prints the feature
            types, >= 2 also prints progress for every feature.

    Returns:
        None.

    Raises:
        ValueError: if a protein/chunk name has no feature with that id in
            the HXB2 reference.
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    import sys
    import numpy as np
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges
    }
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + list(additional_features)
    features = list(edge_dict) + additional_features

    # NOTE: every print below takes a single pre-formatted string, so it
    # gives the same output under Python 2 and Python 3.
    if VERBOSE:
        print('Features: ' + ', '.join(features))

    smat = np.array(seqrecord)

    # Ids already in the record, kept up to date as features are appended so
    # the "already present" check does not rescan the feature list each time.
    present_ids = {fea.id for fea in seqrecord.features}

    for feature_type, edges_all in edge_dict.items():
        if VERBOSE >= 2:
            print('{} {}'.format(feature_type, list(edges_all)))

        for name, edges in edges_all.items():
            if VERBOSE >= 2:
                # No newline: the outcome is printed on the same line
                sys.stdout.write('{} '.format(name))

            # Skip a feature if it's present already
            if name in present_ids:
                if VERBOSE >= 2:
                    print('already present.')
                continue

            # NOTE(review): the built-in key is 'gene', so the two
            # ``feature_type == 'genes'`` branches below only run if
            # additional_edges supplies a 'genes' entry. Possibly a typo;
            # left as is because enabling locate_gene would change every
            # gene annotation.

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6: look for the start motif only, and
                # only from position 6000 onwards; the end stays None and
                # falls back to the end of the sequence below.
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000:],
                                                 [edges[0], None])
                    # A start that was not found stays None so the fallback
                    # below handles it instead of failing on None + int.
                    if pos_edge[0] is not None:
                        pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                if None not in pos_edge:
                    # Cut the primers for some features
                    if name in ('V1', 'V3', 'V4', 'V5'):
                        pos_edge[0] += len(edges[0])
                        pos_edge[1] -= len(edges[1])
                    # Cut only the right primer for V2
                    elif name == 'V2':
                        pos_edge[1] -= len(edges[1])

                # Edges that were not found default to the sequence ends;
                # this is expected (no warning) for the outermost regions.
                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print('WARNING: start not found')
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print('WARNING: end not found')
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat,
                                                           edges,
                                                           min_distance=1)
                location = CompoundLocation([
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ])

            if VERBOSE >= 2:
                print('found: {}'.format(location))

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)

    # Add proteins and other features from HXB2
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    # Built once; .index() below raises ValueError for an unknown name
    ref_ids = [fea.id for fea in ref_ann.features]
    for feagroup, feanames in additional_features_dict.items():
        for feaname in feanames:
            if VERBOSE >= 2:
                sys.stdout.write('{} '.format(feaname))

            fea = ref_ann.features[ref_ids.index(feaname)]
            seq = fea.extract(ref_ann)
            # NOTE(review): ali1/ali2 are presumably the aligned seqrecord
            # and the aligned reference feature, in argument order -- confirm
            # against the seqanpy documentation.
            (score, ali1, ali2) = align_overlap(seqrecord,
                                                seq,
                                                score_gapopen=-20)
            # The feature spans the part of the alignment where ali2 has no
            # leading/trailing gaps; gaps of ali1 inside that span are
            # removed from the end coordinate.
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start:end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print('found: {}'.format(location))

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)