def shift_mapco(mapco, refname, region):
    '''Offset the reference column of a coordinate map by the region start

    The annotated (GenBank) version of the reference is loaded and its
    features are scanned in order for the first one whose id equals `region`.
    The start of that feature is added, in place, to column 0 of `mapco`.
    If no feature carries that id, `mapco` is left untouched.

    Parameters:
       mapco: 2D array, modified in place (only column 0 changes)
       refname: name of the reference, passed to load_custom_reference
       region: id of the feature whose start is used as offset

    Returns:
       None
    '''
    from hivwholeseq.reference import load_custom_reference

    annotated = load_custom_reference(refname, format='gb')

    # Only the first feature with a matching id counts
    hit = next((fea for fea in annotated.features if fea.id == region), None)
    if hit is not None:
        mapco[:, 0] += hit.location.nofuzzy_start
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Copy the annotations of HXB2 onto another reference

    Every feature of the annotated HXB2 record is located in the target
    reference, one contiguous part at a time, and appended to the features
    of the target record.

    Parameters:
       refname: name of the reference to annotate (loaded as fasta)
       VERBOSE (int): if >= 1, print the id of each feature as it is handled

    Returns:
       the target reference record, with the new features appended

    Raises:
       ValueError: if a feature (other than the LTRs and V4) is more than
         10% shorter or longer in the target than in HXB2
    '''
    template = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    ref_string = str(ref.seq)

    # Features for which the length comparison with HXB2 is skipped
    unchecked_ids = ("LTR5'", "LTR3'", 'V4')

    def locate_part(sublocation):
        '''Find one contiguous HXB2 location in the target sequence'''
        template_part = sublocation.extract(template)
        # NOTE(review): trim_to_refseq presumably returns the stretch of the
        # target matching the HXB2 piece, possibly with gaps - confirm
        matched = trim_to_refseq(ref_string, template_part).replace('-', '')
        first = ref_string.find(matched)
        return FeatureLocation(first, first + len(matched), strand=+1)

    for fea in template.features:
        if VERBOSE >= 1:
            print(fea.id)

        # Split features become compound locations, simple ones stay simple
        parts = [locate_part(part) for part in fea.location.parts]
        location = parts[0] if len(parts) == 1 else CompoundLocation(parts)
        feature = SeqFeature(location, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in unchecked_ids:
            len_template = len(fea.extract(template))
            len_ref = len(feature.extract(ref))
            lengths = str(len_ref) + ' vs ' + str(len_template)
            ratio = 1.0 * len_ref / len_template
            if ratio < 0.9:
                raise ValueError('Feature: ' + fea.id + ' is too short: ' + lengths)
            if ratio > 1.1:
                raise ValueError('Feature: ' + fea.id + ' is too long: ' + lengths)

        ref.features.append(feature)

    return ref
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Transfer the HXB2 annotations to another reference sequence

    Each HXB2 feature is split into its contiguous parts, every part is
    located in the target reference, and the resulting feature is appended
    to the target record.

    Parameters:
       refname: name of the reference to annotate (loaded as fasta)
       VERBOSE (int): if >= 1, print the id of each feature as it is handled

    Returns:
       the target reference record, with the new features appended

    Raises:
       ValueError: if a feature (other than the LTRs and V4) is more than
         10% shorter or longer in the target than in HXB2
    '''
    annotated = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    target = str(ref.seq)

    # The length comparison with HXB2 is not applied to these features
    no_length_check = ("LTR5'", "LTR3'", 'V4')

    def map_part(sublocation):
        '''Return the location in the target of one contiguous HXB2 part'''
        piece = sublocation.extract(annotated)
        # NOTE(review): trim_to_refseq is assumed to return the matching
        # stretch of the target, possibly gapped - confirm
        found = trim_to_refseq(target, piece).replace('-', '')
        begin = target.find(found)
        return FeatureLocation(begin, begin + len(found), strand=+1)

    for fea in annotated.features:
        if VERBOSE >= 1:
            print(fea.id)

        sublocations = [map_part(part) for part in fea.location.parts]
        if len(sublocations) == 1:
            new_location = sublocations[0]
        else:
            new_location = CompoundLocation(sublocations)

        feature = SeqFeature(new_location, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in no_length_check:
            n_old = len(fea.extract(annotated))
            n_new = len(feature.extract(ref))
            comparison = str(n_new)+' vs '+str(n_old)
            fraction = 1.0 * n_new / n_old
            if fraction < 0.9:
                raise ValueError('Feature: '+fea.id+' is too short: '+comparison)
            if fraction > 1.1:
                raise ValueError('Feature: '+fea.id+' is too long: '+comparison)

        ref.features.append(feature)

    return ref
def build_reference_alignments(region, refname,
                               VERBOSE=0,
                               subtypes=['B', 'C', 'A', 'AE', 'F1', 'D', 'O', 'H'],
                               codon_align=False,
                               require_full_cover=True,
                              ):
    '''Align raw LANL sequences of a region to a reference, grouped by subtype

    The subtype of a record is the part of its id before the first dot.
    Records of other subtypes, and records for which align_to_reference
    raises ValueError, are skipped.

    Parameters:
       region: genomic region, used for both the reference and the input file
       refname: name of the reference to align to
       VERBOSE (int): >= 1 prints a counter every 100 records, >= 2 also the
         input filename, >= 3 also the subtype of each accepted record
       subtypes: subtypes to keep (this default list is only read)
       codon_align: passed through to align_to_reference
       require_full_cover: passed through to align_to_reference

    Returns:
       defaultdict mapping each subtype found to a MultipleSeqAlignment
    '''
    from hivwholeseq.reference import load_custom_reference
    from Bio import SeqIO
    from Bio.Align import MultipleSeqAlignment

    refstr = ''.join(load_custom_reference(refname, region=region))

    fn_in = get_raw_LANL_sequences_filename(region)
    if VERBOSE >= 2:
        print(fn_in)

    seq_by_subtype = defaultdict(list)
    for n_seq, seq in enumerate(SeqIO.parse(fn_in, 'fasta'), 1):
        # Progress counter, every 100 records read
        if VERBOSE >= 1 and n_seq % 100 == 0:
            print(n_seq)

        subtype = seq.id.split('.')[0]
        if subtype not in subtypes:
            continue

        if VERBOSE >= 3:
            print(subtype)

        try:
            rec = align_to_reference(seq, refstr, VERBOSE=VERBOSE,
                                     require_full_cover=require_full_cover,
                                     codon_align=codon_align)
        except ValueError:
            # Sequences that cannot be aligned are dropped
            continue

        seq_by_subtype[subtype].append(rec)

    # Turn each list of aligned records into an alignment object
    for subtype in list(seq_by_subtype):
        seq_by_subtype[subtype] = MultipleSeqAlignment(seq_by_subtype[subtype])

    return seq_by_subtype
def get_gene_HXB2(genename):
    '''Extract a gene, or a single exon of tat/rev, from HXB2

    Parameters:
       genename: id of an HXB2 feature, or one of 'tat1', 'tat2', 'rev1',
         'rev2' for the first or second exon of tat or rev

    Returns:
       the extracted piece of the annotated HXB2 record

    Raises:
       ValueError: if no HXB2 feature has the requested id
    '''
    from hivwholeseq.reference import load_custom_reference

    HXB2 = load_custom_reference('HXB2', format='gb')

    # Exon names are the gene name followed by the 1-based exon number
    wants_exon = genename in ('tat1', 'tat2', 'rev1', 'rev2')
    if wants_exon:
        exon_n = int(genename[-1])
        genename = genename[:-1]

    feature_ids = [fea.id for fea in HXB2.features]
    gene_coord = HXB2.features[feature_ids.index(genename)]

    if not wants_exon:
        return gene_coord.extract(HXB2)

    return gene_coord.location.parts[exon_n - 1].extract(HXB2)
def get_gene_HXB2(genename):
    '''Return a whole HXB2 feature, or one exon of tat or rev

    Parameters:
       genename: id of an HXB2 feature; 'tat1', 'tat2', 'rev1' and 'rev2'
         select a single exon of the corresponding gene instead

    Returns:
       the extracted piece of the annotated HXB2 record

    Raises:
       ValueError: if no HXB2 feature has the requested id
    '''
    from hivwholeseq.reference import load_custom_reference

    HXB2 = load_custom_reference('HXB2', format='gb')

    def find_feature(feature_id):
        '''Return the first HXB2 feature with this id'''
        all_ids = [fea.id for fea in HXB2.features]
        return HXB2.features[all_ids.index(feature_id)]

    if genename in ('tat1', 'tat2', 'rev1', 'rev2'):
        # The last character is the 1-based exon number
        exon_index = int(genename[-1]) - 1
        exon_coord = find_feature(genename[:-1]).location.parts[exon_index]
        return exon_coord.extract(HXB2)

    return find_feature(genename).extract(HXB2)
# Example #7
# 0
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Sanity-check the annotation of a protein-coding feature

    The feature is extracted from the genomewide sequence and must have a
    length that is a multiple of 3, no N nucleotides, no gaps, no stop codon
    before the last position and no X amino acids. It is then aligned
    globally to the same region of HXB2.

    Parameters:
       fea: the feature to check; its id is also the HXB2 region name
       seqgw: the record the feature is extracted from
       VERBOSE (int): if >= 3, print the pairwise alignment with HXB2
       delta_pos (float): maximal tolerated score deficit per alignment column

    Raises:
       ValueError: if any of the checks fails
    '''
    seq = fea.extract(seqgw).seq

    if len(seq) % 3 != 0:
        raise ValueError('The length of ' + fea.id + ' is not a multiple of 3')

    # Ambiguous nucleotides are reported before gaps
    for symbol, complaint in (('N', 'N nucleotides found in '),
                              ('-', 'Gaps found in ')):
        if symbol in seq:
            raise ValueError(complaint + fea.id)

    prot = seq.translate()

    # A stop is only acceptable as the very last amino acid
    first_stop = prot.find('*')
    if first_stop not in (-1, len(prot) - 1):
        raise ValueError('Premature stops found in ' + fea.id)

    if 'X' in prot:
        raise ValueError('X amino acids found in ' + fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis),
                                  name1='HXB2',
                                  name2='seq',
                                  width=100)

    # The best possible score is taken to be 3 per alignment column
    n_columns = len(alis)
    if 3 * n_columns - score > delta_pos * n_columns:
        raise ValueError('The sequence of ' + fea.id +
                         ' looks different from HXB2')
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Verify that an annotated feature looks like a proper protein

    Checks, in order: length multiple of 3, no N nucleotides, no gaps, no
    stop codon before the last position, no X amino acids, and a global
    alignment to the same HXB2 region that scores well enough.

    Parameters:
       fea: the feature to check; its id is also the HXB2 region name
       seqgw: the record the feature is extracted from
       VERBOSE (int): if >= 3, print the pairwise alignment with HXB2
       delta_pos (float): maximal tolerated score deficit per alignment column

    Raises:
       ValueError: if any of the checks fails
    '''
    seq = fea.extract(seqgw).seq

    if len(seq) % 3 != 0:
        raise ValueError('The length of '+fea.id+' is not a multiple of 3')

    if 'N' in seq:
        raise ValueError('N nucleotides found in '+fea.id)
    if '-' in seq:
        raise ValueError('Gaps found in '+fea.id)

    prot = seq.translate()

    # Only a stop at the very end of the protein is tolerated
    stop_at = prot.find('*')
    has_early_stop = (stop_at != -1) and (stop_at != len(prot) - 1)
    if has_early_stop:
        raise ValueError('Premature stops found in '+fea.id)

    if 'X' in prot:
        raise ValueError('X amino acids found in '+fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # Score deficit relative to 3 points per alignment column
    ali_length = len(alis)
    deficit = 3 * ali_length - score
    if deficit > delta_pos * ali_length:
        raise ValueError('The sequence of '+fea.id+' looks different from HXB2')
    maps_coord = defaultdict(dict)
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            refseq = load_custom_reference(refname, format='gb', region=region)
            patseq = patient.get_reference(region)

            mapco = build_coordinate_map(refseq, patseq, VERBOSE=VERBOSE)
            mapco = np.array(mapco, int)
            shift_mapco(mapco, refname, region)

            maps_coord[(region, pname)] = mapco 

            if save_to_file:
                out_fn = get_coordinate_map_filename(pname, region, refname=refname)
                np.savetxt(out_fn, mapco, fmt='%d',
                           delimiter='\t',
                           header=refname+'\t'+pname+'_'+region)
                if VERBOSE:
                    print 'Saved to file:', pname, region
# Example #10
# 0
    bins = np.exp(tbins)/(1+np.exp(tbins))
    binsc = np.sqrt(bins[1:] * bins[:-1])
    binw = np.diff(bins)
    hists = np.zeros((len(S_bins) - 1, len(binsc)))

    if VERBOSE >= 1:
        print 'Load alignment, reference, and coordinate map'
    ali = load_custom_alignment('HIV1_FLT_2013_genome_DNA')
    alim = np.array(ali, 'S1')
    S = np.zeros(alim.shape[1])
    for a in alpha[:5]:
        nu = (alim == a).mean(axis=0)
        S -= nu * np.log2(nu + 1e-8)

    refname = 'HXB2'
    refseq = load_custom_reference('HXB2', format='gb')
    mapali = get_coordinate_map(refname, ali)
    if len(refseq) != mapali[0, -1] + 1:
        raise ValueError('Reference '+refname+' in alignment is not complete')
    Sref = S[mapali[1]]

    Srefind = - np.ones(len(Sref), int)
    for i, Sbin in enumerate(S_bins[:-2]):
        ind = (Sref >= Sbin) & (Sref < S_bins[i + 1])
        Srefind[ind] = i
    Srefind[Srefind < 0] = len(S_bins) - 2
    
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 2:
        print 'fragments', fragments
    edges_chunk = all_edges[name]
    edges = find_region_edges(smat, edges_chunk)
    # Some features must be stripped of primers
    if name in ['V3']:
        edges[0] += len(edges_chunk[0])
        edges[1] -= len(edges_chunk[1])

    return [edges]



# Script
if __name__ == '__main__':

    seqold = load_custom_reference('HXB2', 'gb')

    seqnew = load_custom_reference('HXB2', 'fasta')
    smat = np.array(seqnew)

    print 'Add features'
    for typ, coord_typ in coordinates.iteritems():
        for name, edges in coord_typ.iteritems():
            # If coordinates are missing, grab primers
            if edges is None:
                edges = get_coordinates_feature(smat, name)

            if len(edges) == 1:
                fea = SeqFeature(FeatureLocation(edges[0][0], edges[0][1], strand=+1),
                                 type=typ,
                                 id=name)
    regions = args.regions
    VERBOSE = args.verbose
    use_save = args.save
    use_joint = args.joint

    patients = load_patients()
    if pnames is not None:
        patients = patients.iloc[patients.index.isin(pnames)]

    pcodes = [p.code for _, p in patients.iterrows()]

    for region in regions:
        if use_joint:
            seqs_all = []
            for refname in _refs:
                seq = load_custom_reference(refname, region=region)
                if refname == 'F10':
                    refname = 'pZM246F_10'
                seq.id = 'Isolate_'+refname
                seq.name = 'Isolate_'+refname
                seq.description = 'Isolate: '+refname
                if refname in ['38540', 'pZM246F_10']:
                    seq.subtype = 'C'
                else:
                    seq.subtype = 'B'
                seqs_all.append(seq)

        for pname, patient in iterpatient(patients):
            patient.discard_nonsequenced_samples()
            if VERBOSE >= 1:
                print region, pname
                else:
                    PCRs_sample = [PCR]
                for PCR_sample in PCRs_sample:
                    bamfilename = sample.get_mapped_filtered_filename(fragment, PCR=PCR_sample, decontaminated=False)
                    if not os.path.isfile(bamfilename):
                        continue

                    # if check_already_decontaminated(sample, fragment, PCR_sample):
                    #    continue

                    fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample)

        sys.exit()

    for fragment in fragments:
        consensi = {refname: "".join(load_custom_reference(refname + "_" + fragment)) for refname in refnames}
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            try:
                consensi[samplename] = sample.get_consensus(fragment, PCR=1)
            except IOError:
                print samplename, "file not found"
                continue

        for samplename, sample in samples_focal.iterrows():
            sample = SamplePat(sample)
            pname = sample.patient

            if PCR is None:
                PCRs_sample = (1, 2)
            else:
                    #if check_already_decontaminated(sample, fragment, PCR_sample):
                    #    continue

                    fork_self(samplename,
                              fragment,
                              VERBOSE=VERBOSE,
                              maxreads=maxreads,
                              summary=summary,
                              PCR=PCR_sample)

        sys.exit()

    for fragment in fragments:
        consensi = {
            refname: ''.join(load_custom_reference(refname + '_' + fragment))
            for refname in refnames
        }
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            try:
                consensi[samplename] = sample.get_consensus(fragment, PCR=1)
            except IOError:
                print samplename, 'file not found'
                continue

        for samplename, sample in samples_focal.iterrows():
            sample = SamplePat(sample)
            pname = sample.patient

            if PCR is None:
# Example #15
# 0
    for pname, patient in patients.iterrows():
        patient = Patient(patient)

        # Make maps for all annotations if not explicit
        if regions is None:
            patseqann = patient.get_reference('genomewide', format='gb')
            regionspat = map(attrgetter('id'),
                             patseqann.features) + ['genomewide']
        else:
            regionspat = regions

        for region in regionspat:
            if VERBOSE >= 1:
                print pname, region

            refseq = load_custom_reference(refname, format='gb', region=region)
            patseq = patient.get_reference(region)

            mapco = build_coordinate_map(refseq, patseq, VERBOSE=VERBOSE)
            mapco = np.array(mapco, int)
            shift_mapco(mapco, refname, region)

            maps_coord[(region, pname)] = mapco

            if save_to_file:
                out_fn = get_coordinate_map_filename(pname,
                                                     region,
                                                     refname=refname)
                np.savetxt(out_fn,
                           mapco,
                           fmt='%d',
    regions = args.regions
    VERBOSE = args.verbose
    use_save = args.save
    use_joint = args.joint

    patients = load_patients()
    if pnames is not None:
        patients = patients.iloc[patients.index.isin(pnames)]

    pcodes = [p.code for _, p in patients.iterrows()]

    for region in regions:
        if use_joint:
            seqs_all = []
            for refname in _refs:
                seq = load_custom_reference(refname, region=region)
                if refname == 'F10':
                    refname = 'pZM246F_10'
                seq.id = 'Isolate_' + refname
                seq.name = 'Isolate_' + refname
                seq.description = 'Isolate: ' + refname
                if refname in ['38540', 'pZM246F_10']:
                    seq.subtype = 'C'
                else:
                    seq.subtype = 'B'
                seqs_all.append(seq)

        for pname, patient in iterpatient(patients):
            patient.discard_nonsequenced_samples()
            if VERBOSE >= 1:
                print region, pname
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)

    New SeqFeature objects are appended to seqrecord.features; nothing is
    returned. Two passes are made: first the features described by edge
    sequences (genes, RNA structures, other), then features copied from the
    annotated HXB2 reference via an overlap alignment.

    Parameters:
       seqrecord: the record to annotate
       additional_edges (dict): extra entries merged into the table of
         feature type -> {feature name: edges}
       additional_features (list): feature groups to copy from HXB2. The
         group 'protein' is always added; only 'protein' and 'chunk' are
         acted upon
       VERBOSE (int): verbosity level
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    # Caller entries are merged into this local dict, so the default argument
    # itself is never modified
    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    edge_dict.update(additional_edges)
    # Builds a new list: the caller's (or the default) list is left untouched
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    # Array form of the sequence, handed to the edge-finding helpers below
    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        # NOTE(review): this is printed regardless of VERBOSE
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                # (search only from position 6000 on and only for the left
                # edge, then shift back to full-sequence coordinates)
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::], [edges[0], None])
                    pos_edge[0] += 6000
                # NOTE(review): the built-in key of edge_dict is 'gene', not
                # 'genes', so this branch only runs if additional_edges
                # supplies a 'genes' entry - confirm the intent
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                # A missing start falls back to the beginning of the sequence;
                # the warning is suppressed for F1 and the 5' LTR
                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                # A missing end falls back to the end of the sequence;
                # the warning is suppressed for F6 and the 3' LTR
                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                # Split feature: one location per part, joined together
                # NOTE(review): same 'genes' vs 'gene' remark as above
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name+suff, output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feature_type, id=name, strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    # Only the groups 'protein' and 'chunk' are recognized here
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems():
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            # First HXB2 feature with this id (ValueError if there is none)
            fea = ref_ann.features[map(attrgetter('id'), ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            # presumably ali1 is the aligned seqrecord and ali2 the aligned
            # HXB2 feature - TODO confirm against seqanpy
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            # Number of leading gaps in ali2, and end of its non-gap part
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            # Gaps of ali1 within the window are discounted from the end
            end -= ali1[start: end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
# Example #18
# 0
def annotate_sequence(seqrecord,
                      additional_edges={},
                      additional_features=['chunk'],
                      VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)

    New SeqFeature objects are appended to seqrecord.features; nothing is
    returned. Two passes are made: first the features described by edge
    sequences (genes, RNA structures, other), then features copied from the
    annotated HXB2 reference via an overlap alignment.

    Parameters:
       seqrecord: the record to annotate
       additional_edges (dict): extra entries merged into the table of
         feature type -> {feature name: edges}
       additional_features (list): feature groups to copy from HXB2. The
         group 'protein' is always added; only 'protein' and 'chunk' are
         acted upon
       VERBOSE (int): verbosity level
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    # Caller entries are merged into this local dict, so the default argument
    # itself is never modified
    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges
    }
    edge_dict.update(additional_edges)
    # Builds a new list: the caller's (or the default) list is left untouched
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    # Array form of the sequence, handed to the edge-finding helpers below
    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        # NOTE(review): this is printed regardless of VERBOSE
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                # (search only from position 6000 on and only for the left
                # edge, then shift back to full-sequence coordinates)
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::],
                                                 [edges[0], None])
                    pos_edge[0] += 6000
                # NOTE(review): the built-in key of edge_dict is 'gene', not
                # 'genes', so this branch only runs if additional_edges
                # supplies a 'genes' entry - confirm the intent
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                # A missing start falls back to the beginning of the sequence;
                # the warning is suppressed for F1 and the 5' LTR
                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                # A missing end falls back to the end of the sequence;
                # the warning is suppressed for F6 and the 3' LTR
                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                # Split feature: one location per part, joined together
                # NOTE(review): same 'genes' vs 'gene' remark as above
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat,
                                                           edges,
                                                           min_distance=1)
                locations = [
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    # Only the groups 'protein' and 'chunk' are recognized here
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems(
    ):
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            # First HXB2 feature with this id (ValueError if there is none)
            fea = ref_ann.features[map(attrgetter('id'),
                                       ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            # presumably ali1 is the aligned seqrecord and ali2 the aligned
            # HXB2 feature - TODO confirm against seqanpy
            (score, ali1, ali2) = align_overlap(seqrecord,
                                                seq,
                                                score_gapopen=-20)
            # Number of leading gaps in ali2, and end of its non-gap part
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            # Gaps of ali1 within the window are discounted from the end
            end -= ali1[start:end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
# Example #19
# 0
                        help='Reference to use for alignment')
    parser.add_argument('--verbose',
                        type=int,
                        default=0,
                        help='Verbosity level [0-4]')
    parser.add_argument('--subtypes',
                        nargs='+',
                        default=['B'],
                        help='Subtypes to keep')

    args = parser.parse_args()
    regions = args.regions
    refname = args.reference
    VERBOSE = args.verbose
    subtypes = args.subtypes

    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    ref = load_custom_reference('HXB2', 'gb')

    for region in regions:
        regm = np.array(find_annotation(ref, region).extract(ref), 'S1')
        for subtype in subtypes:
            fn = get_subtype_reference_alignment_filename(region,
                                                          subtype=subtype,
                                                          refname=refname,
                                                          VERBOSE=VERBOSE)
            alim = np.array(AlignIO.read(fn, 'fasta'), 'S1')
            weird = ((alim != regm).mean(axis=1) > 0.2)
            print region, subtype, weird.sum()
    parser.add_argument('--reference', required=True,
                        help='Reference to analyze (e.g. LAI-III)')
    parser.add_argument('--fragments', nargs='+', default=fragments,
                        help='Fragments to merge')
    parser.add_argument('--save', action='store_true',
                        help='Save to file')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-3]')

    args = parser.parse_args()
    refname = args.reference
    use_save = args.save
    VERBOSE = args.verbose


    consensi = [load_custom_reference(refname+'_'+fr) for fr in fragments]
    consensus = merge_sequences(consensi, VERBOSE=VERBOSE)

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    consrec = SeqRecord(Seq(consensus, alphabet=consensi[0].seq.alphabet),
                        id=refname+'_genomewide',
                        name=refname+'_genomewide',
                        description=refname+', genomewide reference (merged)',
                       )

    if use_save:
        if VERBOSE >= 1:
            print 'Save to file'
        from Bio import SeqIO
        fn = get_custom_reference_filename(refname, format='fasta')
def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts

    The custom reference `refname` is loaded and, unless `fragments` is None,
    cut so that it runs from the forward primer of the first fragment to the
    reverse primer of the last fragment (both primers included). The result is
    written in FASTA format to the premap reference file.

    Parameters:
       data_folder: passed to the file name helpers of the premap step
       adaID: passed to the file name helpers of the premap step
       fragments: list of fragment names (keys of primers_PCR), or None to
                  keep the whole reference. A first/last entry that joins
                  several primer alternatives with '+' is resolved to the
                  alternative giving the longest reference.
                  NOTE: that entry is replaced IN PLACE in the list.
       refname: name of the custom reference to load
       VERBOSE: if true, print the primer summary to stdout
       summary: if true, append a short report to the premap summary file

    Raises:
       ValueError: if the reference leaves no room to look for a primer
    '''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas

        # The two ends are resolved one after the other on purpose: with a
        # single fragment, the last entry is the already resolved first one.
        if '+' in fragments[0]:
            fragments[0] = _pick_outermost_fragment(
                fragments[0], primers_coordinates_HXB2, 0, 0, np.argmin)
        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragments[-1] = _pick_outermost_fragment(
                fragments[-1], primers_coordinates_HXB2, 1, 1, np.argmax)
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best
        # match. The stop values are "+ 1" so that the window ending exactly at
        # the end of the reference is tested too.
        n_matches_fwd = _count_primer_matches(
            smat, eas(pr_fwd), len(pr_fwd),
            0, len(seq) - len(pr_fwd) + 1)
        if not n_matches_fwd:
            raise ValueError('Reference ' + refname +
                             ' is too short for the fwd primer of ' +
                             fragments[0])
        pr_fwd_pos = np.argmax(n_matches_fwd)

        # The rev primer is only searched downstream of the fwd primer
        rev_stop = len(seq) - len(pr_rev) + 1
        n_matches_rev = _count_primer_matches(
            smat, eas(pr_rev), len(pr_rev),
            pr_fwd_pos + len(pr_fwd), rev_stop)
        if not n_matches_rev:
            raise ValueError('Reference ' + refname +
                             ' has no room for the rev primer of ' +
                             fragments[-1])
        # Here you come from the right, i.e. look in the 3' LTR first: the
        # argmax on the reversed list picks the rightmost best window
        pr_rev_pos = rev_stop - 1 - np.argmax(n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print(output)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]

        # Tag the record with its 1-based, inclusive coordinates
        first_pos = str(pr_fwd_pos + 1)
        last_pos = str(pr_rev_pos + len(pr_rev))
        seq_trim.id = '_'.join([seq_trim.id, first_pos, last_pos])
        seq_trim.name = '_'.join([seq_trim.name, first_pos, last_pos])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from', first_pos, 'to', last_pos,
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')


def _pick_outermost_fragment(fragment, coordinates, i_primer, i_edge, argbest):
    '''Resolve a fragment name with '+'-joined alternatives to a single one

    The name is split as <2 char prefix><alt>+<alt>...<1 char suffix>; the
    alternative kept is the one selected by `argbest` (np.argmin or np.argmax)
    on coordinates[name][i_primer][i_edge].
    '''
    prefix, suffix = fragment[:2], fragment[-1]
    candidates = [prefix + alt + suffix
                  for alt in fragment[2:-1].split('+')]
    edges = [coordinates[cand][i_primer][i_edge] for cand in candidates]
    return candidates[argbest(edges)]


def _count_primer_matches(smat, primer_variants, width, start, stop):
    '''Count matching positions of the best primer variant for each window

    Windows are smat[i:i + width] for i in [start, stop); for each one the
    number of matching characters of the best-matching variant is returned.
    '''
    variants_mat = np.array([list(variant) for variant in primer_variants],
                            'S1')
    return [(smat[i:i + width] == variants_mat).sum(axis=1).max()
            for i in range(start, stop)]
Beispiel #22
0
    from hivwholeseq.utils.genome_info import all_edges, find_region_edges

    # Fetch the pair of edge sequences registered under this feature name and
    # locate them in the sequence matrix
    edges_chunk = all_edges[name]
    edges = find_region_edges(smat, edges_chunk)
    # Some features must be stripped of primers
    # (the start is moved past the first edge sequence and the end is moved
    # back by the length of the second one)
    if name in ['V3']:
        edges[0] += len(edges_chunk[0])
        edges[1] -= len(edges_chunk[1])

    # Returned as a one-element list, i.e. a single (start, end) pair
    return [edges]


# Script
if __name__ == '__main__':

    # HXB2 is loaded twice: from the GenBank file and from the FASTA file
    seqold = load_custom_reference('HXB2', 'gb')

    seqnew = load_custom_reference('HXB2', 'fasta')
    # Array version of the FASTA record, used to search for feature edges
    smat = np.array(seqnew)

    print 'Add features'
    # `coordinates` is a two-level dict; the inner one maps a feature name to
    # its edges (or None). The outer key is presumably the feature type -
    # TODO confirm against the definition of `coordinates`.
    for typ, coord_typ in coordinates.iteritems():
        for name, edges in coord_typ.iteritems():
            # If coordinates are missing, grab primers
            if edges is None:
                edges = get_coordinates_feature(smat, name)

            # A single (start, end) pair becomes a plain forward-strand location
            if len(edges) == 1:
                fea = SeqFeature(FeatureLocation(edges[0][0],
                                                 edges[0][1],
                                                 strand=+1),
Beispiel #23
0
def correlate_epitope_substitution(ds, dctl):
    '''Correlate presence of a substitution with epitope

    For every patient code in `dctl`, each position between the outer F1 and
    F6 primers is flagged as lying in an epitope (from the 'start_HXB2' and
    'end_HXB2' columns of `dctl`) and as carrying a nonsynonymous
    substitution (rows of `ds` with 'syn' == False, placed by 'pos_ref').
    Part of env is excluded, then the 2x2 table epitope x substitution is
    printed with Fisher's exact test, once for all positions and once for the
    positions that are in an epitope in at least one patient.

    Parameters:
       ds: DataFrame of substitutions, columns 'pcode', 'syn', 'pos_ref'
       dctl: DataFrame of epitopes, columns 'pcode', 'start_HXB2', 'end_HXB2'

    Returns:
       dict with the per-position tables: 'dg' (all positions kept) and 'dg2'
       (only positions in an epitope of some patient)
    '''
    from hivwholeseq.data.primers import primers_coordinates_HXB2_outer
    start_F1 = primers_coordinates_HXB2_outer['F1'][0][1]
    end_F6 = primers_coordinates_HXB2_outer['F6'][1][0]

    ds = ds.copy()

    def emit(*items):
        # Same text as a Python 2 print statement: items joined by one space
        print(' '.join(str(item) for item in items))

    def report(table, enrichment_label):
        # Print the contingency table, Fisher's test and the excess of
        # substitutions in epitopes over the expectation from non-epitopes
        counts = np.array(table)
        expected = 1.0 * counts[1, 0] / counts[0, 0] * counts[0, 1]
        excess = counts[1, 1] - expected
        emit(table)
        from scipy.stats import fisher_exact
        fisher = fisher_exact(counts)
        emit(enrichment_label, fisher[0])
        emit('Fisher\'s exact P value:', fisher[1])
        emit('expected:', expected)
        emit('excess:', excess, 'per patient:', excess / 9.0)

    per_patient = []
    for pcode, epitopes in dctl.groupby('pcode'):
        positions = np.arange(start_F1, end_F6)

        # Mark positions covered by any epitope of this patient
        in_epitope = np.zeros(len(positions), bool)
        for _, epi in epitopes.iterrows():
            covered = (positions >= epi['start_HXB2']) & \
                      (positions < epi['end_HXB2'])
            in_epitope[covered] = True

        # Keep only nonsyn substitutions of this patient
        subs = ds.loc[ds['pcode'] == pcode]
        subs = subs.loc[subs['syn'] == False]
        has_subst = np.zeros(len(positions), bool)
        has_subst[subs['pos_ref'] - positions[0]] = True

        table = pd.DataFrame({
            'pos': positions,
            'epitope': in_epitope,
            'substitution': has_subst,
        })
        table['pcode'] = pcode
        per_patient.append(table)
    dg = pd.concat(per_patient)

    # Exclude env because it has antibody-related substitutions
    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    ref = load_custom_reference('HXB2', 'gb')
    gp41_location = find_annotation(ref, 'gp41').location
    start_env = gp41_location.nofuzzy_start
    end_env = gp41_location.nofuzzy_end - 450
    outside_env = (dg['pos'] < start_env) | (dg['pos'] >= end_env)
    dg = dg.loc[outside_env]

    # All positions
    M = dg.groupby(['epitope', 'substitution']).size().unstack()
    report(M, 'Fisher\'s exact enrichment:')

    # Only positions that are in an epitope in at least one patient
    pos_epi = dg.loc[dg['epitope'] == True]['pos'].unique()
    dg2 = dg.loc[dg['pos'].isin(pos_epi)].copy()
    M2 = dg2.groupby(['epitope', 'substitution']).size().unstack()
    report(M2, '\nFisher\'s exact enrichment:')

    return {
        'dg': dg,
        'dg2': dg2,
    }
    # Remaining command-line options (the parser and the earlier options are
    # defined above this fragment)
    parser.add_argument('--reference', default='HXB2',
                        help='Reference to use for alignment')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-4]')
    parser.add_argument('--subtypes', nargs='+', default=['B'],
                        help='Subtypes to keep')

    args = parser.parse_args()
    regions = args.regions
    refname = args.reference
    VERBOSE = args.verbose
    subtypes = args.subtypes


    from hivwholeseq.reference import load_custom_reference
    from hivwholeseq.utils.sequence import find_annotation
    # NOTE(review): the comparison sequence is always HXB2, while `refname`
    # is only used to pick the alignment file - confirm this is intended for
    # references other than HXB2.
    ref = load_custom_reference('HXB2', 'gb')

    for region in regions:
        # Character array of the region as annotated on the reference
        regm = np.array(find_annotation(ref, region).extract(ref), 'S1')
        for subtype in subtypes:
            fn = get_subtype_reference_alignment_filename(region,
                                                          subtype=subtype,
                                                          refname=refname,
                                                          VERBOSE=VERBOSE)
            # Alignment as a 2D character matrix, one row per sequence
            alim = np.array(AlignIO.read(fn, 'fasta'), 'S1')
            # Flag sequences that differ from the reference region at more
            # than 20% of the columns, and print how many there are
            # (assumes the alignment columns match the region one to one -
            # TODO confirm)
            weird = ((alim != regm).mean(axis=1) > 0.2)
            print region, subtype, weird.sum()
                

def make_reference(data_folder,
                   adaID,
                   fragments,
                   refname,
                   VERBOSE=0,
                   summary=True):
    '''Make reference sequence trimmed to the necessary parts

    The custom reference `refname` is loaded and, unless `fragments` is None,
    cut so that it runs from the forward primer of the first fragment to the
    reverse primer of the last fragment (both primers included). The result is
    written in FASTA format to the premap reference file.

    Parameters:
       data_folder: passed to the file name helpers of the premap step
       adaID: passed to the file name helpers of the premap step
       fragments: list of fragment names (keys of primers_PCR), or None to
                  keep the whole reference. A first/last entry that joins
                  several primer alternatives with '+' is resolved to the
                  alternative giving the longest reference.
                  NOTE: that entry is replaced IN PLACE in the list.
       refname: name of the custom reference to load
       VERBOSE: if true, print the primer summary to stdout
       summary: if true, append a short report to the premap summary file

    Raises:
       ValueError: if the reference leaves no room to look for a primer
    '''
    from hivwholeseq.reference import load_custom_reference
    seq = load_custom_reference(refname)

    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is None:
        seq_trim = seq
    else:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas

        # The two ends are resolved one after the other on purpose: with a
        # single fragment, the last entry is the already resolved first one.
        if '+' in fragments[0]:
            fragments[0] = _pick_outermost_fragment(
                fragments[0], primers_coordinates_HXB2, 0, 0, np.argmin)
        pr_fwd = primers_PCR[fragments[0]][0]

        if '+' in fragments[-1]:
            fragments[-1] = _pick_outermost_fragment(
                fragments[-1], primers_coordinates_HXB2, 1, 1, np.argmax)
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)

        # Get all possible primers from ambiguous nucleotides and get the best
        # match. The stop values are "+ 1" so that the window ending exactly at
        # the end of the reference is tested too.
        n_matches_fwd = _count_primer_matches(
            smat, eas(pr_fwd), len(pr_fwd),
            0, len(seq) - len(pr_fwd) + 1)
        if not n_matches_fwd:
            raise ValueError('Reference ' + refname +
                             ' is too short for the fwd primer of ' +
                             fragments[0])
        pr_fwd_pos = np.argmax(n_matches_fwd)

        # The rev primer is only searched downstream of the fwd primer
        rev_stop = len(seq) - len(pr_rev) + 1
        n_matches_rev = _count_primer_matches(
            smat, eas(pr_rev), len(pr_rev),
            pr_fwd_pos + len(pr_fwd), rev_stop)
        if not n_matches_rev:
            raise ValueError('Reference ' + refname +
                             ' has no room for the rev primer of ' +
                             fragments[-1])
        # Here you come from the right, i.e. look in the 3' LTR first: the
        # argmax on the reversed list picks the rightmost best window
        pr_rev_pos = rev_stop - 1 - np.argmax(n_matches_rev[::-1])

        output = [['Reference name:', refname]]
        output.append(['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd])
        output.append(['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev])
        output = '\n'.join(map(' '.join, output))
        if VERBOSE:
            print(output)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'a') as f:
                f.write(output)
                f.write('\n')

        # The reference includes both the first fwd primer and the last rev one
        seq_trim = seq[pr_fwd_pos:pr_rev_pos + len(pr_rev)]

        # Tag the record with its 1-based, inclusive coordinates
        first_pos = str(pr_fwd_pos + 1)
        last_pos = str(pr_rev_pos + len(pr_rev))
        seq_trim.id = '_'.join([seq_trim.id, first_pos, last_pos])
        seq_trim.name = '_'.join([seq_trim.name, first_pos, last_pos])
        seq_trim.description = ' '.join([
            seq_trim.description, 'from', first_pos, 'to', last_pos,
            '(indices from 1, extremes included)'
        ])

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename)
            f.write('\n')


def _pick_outermost_fragment(fragment, coordinates, i_primer, i_edge, argbest):
    '''Resolve a fragment name with '+'-joined alternatives to a single one

    The name is split as <2 char prefix><alt>+<alt>...<1 char suffix>; the
    alternative kept is the one selected by `argbest` (np.argmin or np.argmax)
    on coordinates[name][i_primer][i_edge].
    '''
    prefix, suffix = fragment[:2], fragment[-1]
    candidates = [prefix + alt + suffix
                  for alt in fragment[2:-1].split('+')]
    edges = [coordinates[cand][i_primer][i_edge] for cand in candidates]
    return candidates[argbest(edges)]


def _count_primer_matches(smat, primer_variants, width, start, stop):
    '''Count matching positions of the best primer variant for each window

    Windows are smat[i:i + width] for i in [start, stop); for each one the
    number of matching characters of the best-matching variant is returned.
    '''
    variants_mat = np.array([list(variant) for variant in primer_variants],
                            'S1')
    return [(smat[i:i + width] == variants_mat).sum(axis=1).max()
            for i in range(start, stop)]
    parser.add_argument('--fragments',
                        nargs='+',
                        default=fragments,
                        help='Fragments to merge')
    parser.add_argument('--save', action='store_true', help='Save to file')
    parser.add_argument('--verbose',
                        type=int,
                        default=0,
                        help='Verbosity level [0-3]')

    args = parser.parse_args()
    refname = args.reference
    use_save = args.save
    VERBOSE = args.verbose

    consensi = [load_custom_reference(refname + '_' + fr) for fr in fragments]
    consensus = merge_sequences(consensi, VERBOSE=VERBOSE)

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    consrec = SeqRecord(
        Seq(consensus, alphabet=consensi[0].seq.alphabet),
        id=refname + '_genomewide',
        name=refname + '_genomewide',
        description=refname + ', genomewide reference (merged)',
    )

    if use_save:
        if VERBOSE >= 1:
            print 'Save to file'
        from Bio import SeqIO