def shift_mapco(mapco, refname, region):
    '''Shift coordinate map to the beginning of the reference sequence

    The first column of mapco is incremented in place by the start position
    (location.nofuzzy_start) of the first feature whose id equals region in
    the GenBank version of the reference. If the reference has no such
    feature, mapco is left untouched.

    Parameters:
       mapco: 2D array supporting ``mapco[:, 0] += offset``
       refname: name of the custom reference to load
       region: id of the feature to look for among the reference annotations

    Returns:
       None (mapco is modified in place)
    '''
    from hivwholeseq.reference import load_custom_reference

    annotated = load_custom_reference(refname, format='gb')
    hits = (fea for fea in annotated.features if fea.id == region)
    first_hit = next(hits, None)
    if first_hit is None:
        # Region not annotated on this reference: no shift is applied
        return

    mapco[:, 0] += first_hit.location.nofuzzy_start
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Annotate copying from HXB2

    Every feature of the GenBank HXB2 reference is relocated onto the FASTA
    reference called refname and appended to its feature list.

    Parameters:
       refname: name of the custom reference to annotate
       VERBOSE: if >= 1, print the id of each feature as it is processed

    Returns:
       the loaded reference record, with the new features appended

    Raises:
       ValueError: if a relocated feature (other than LTR5', LTR3' and V4) is
         less than 90% or more than 110% as long as its HXB2 counterpart
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)

    # Features for which the length comparison with HXB2 is skipped
    unchecked_ids = ["LTR5'", "LTR3'", 'V4']

    def get_sublocation(sublocation):
        '''Find in ref the stretch corresponding to one HXB2 location part'''
        matched = trim_to_refseq(refs, sublocation.extract(hxb2))
        matched = matched.replace('-', '')
        # NOTE(review): str.find returns -1 when nothing matches; that value
        # is passed on unchecked, as in the original code
        start = refs.find(matched)
        return FeatureLocation(start, start + len(matched), strand=+1)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        parts = [get_sublocation(part) for part in fea.location.parts]
        if len(parts) == 1:
            loc = parts[0]
        else:
            loc = CompoundLocation(parts)
        feature = SeqFeature(loc, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in unchecked_ids:
            L1 = len(fea.extract(hxb2))
            L2 = len(feature.extract(ref))
            ratio = 1.0 * L2 / L1
            s = str(L2) + ' vs ' + str(L1)
            if ratio < 0.9:
                raise ValueError('Feature: ' + fea.id + ' is too short: ' + s)
            if ratio > 1.1:
                raise ValueError('Feature: ' + fea.id + ' is too long: ' + s)

        ref.features.append(feature)

    return ref
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Annotate copying from HXB2

    Loads HXB2 (GenBank) and the reference refname (FASTA), relocates each
    HXB2 feature onto the reference and appends it to the reference features.

    Parameters:
       refname: name of the custom reference to annotate
       VERBOSE: if >= 1, print each feature id while processing

    Returns:
       the reference record with the transferred features appended

    Raises:
       ValueError: if the length of a transferred feature deviates by more
         than 10% from the HXB2 one (LTR5', LTR3' and V4 are not checked)
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)

    def get_sublocation(sublocation):
        '''Translate a single HXB2 location part into a location on ref'''
        hxb2_part = sublocation.extract(hxb2)
        ungapped = trim_to_refseq(refs, hxb2_part).replace('-', '')
        # NOTE(review): a failed search yields begin == -1, which is not
        # handled here (same as the original)
        begin = refs.find(ungapped)
        return FeatureLocation(begin, begin + len(ungapped), strand=+1)

    def check_length(fea, feature):
        '''Raise ValueError if the new feature is >10% shorter/longer'''
        L1 = len(fea.extract(hxb2))
        L2 = len(feature.extract(ref))
        s = str(L2) + ' vs ' + str(L1)
        if 1.0 * L2 / L1 < 0.9:
            raise ValueError('Feature: ' + fea.id + ' is too short: ' + s)
        elif 1.0 * L2 / L1 > 1.1:
            raise ValueError('Feature: ' + fea.id + ' is too long: ' + s)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        sublocs = [get_sublocation(part) for part in fea.location.parts]
        loc = sublocs[0] if len(sublocs) == 1 else CompoundLocation(sublocs)
        feature = SeqFeature(loc, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in ["LTR5'", "LTR3'", 'V4']:
            check_length(fea, feature)

        ref.features.append(feature)

    return ref
def build_reference_alignments(region, refname,
                               VERBOSE=0,
                               subtypes=None,
                               codon_align=False,
                               require_full_cover=True,
                              ):
    '''Build reference alignment by subtype

    Parameters:
       region: region name, used both to load the reference and to locate
         the raw LANL sequence file
       refname: name of the custom reference to align against
       VERBOSE: verbosity level (>= 1 progress every 100 sequences, >= 2 input
         filename, >= 3 subtype of every kept sequence)
       subtypes: subtypes to keep; the subtype is the part of the sequence id
         before the first dot. None (default) means
         ['B', 'C', 'A', 'AE', 'F1', 'D', 'O', 'H'].
       codon_align: passed on to align_to_reference
       require_full_cover: passed on to align_to_reference

    Returns:
       defaultdict mapping each subtype that had at least one aligned
       sequence to a MultipleSeqAlignment
    '''
    from hivwholeseq.reference import load_custom_reference
    from Bio import SeqIO
    from Bio.Align import MultipleSeqAlignment

    # The default is built here rather than in the signature, so no list
    # object is shared between calls
    if subtypes is None:
        subtypes = ['B', 'C', 'A', 'AE', 'F1', 'D', 'O', 'H']

    ref = load_custom_reference(refname, region=region)
    refstr = ''.join(ref)

    seq_by_subtype = defaultdict(list)

    fn_in = get_raw_LANL_sequences_filename(region)
    if VERBOSE >= 2:
        print(fn_in)

    for i, seq in enumerate(SeqIO.parse(fn_in, 'fasta')):
        if (VERBOSE >= 1) and not ((i + 1) % 100):
            print(i + 1)

        subtype = seq.id.split('.')[0]
        if subtype not in subtypes:
            continue

        if VERBOSE >= 3:
            print(subtype)

        try:
            rec = align_to_reference(seq, refstr, VERBOSE=VERBOSE,
                                     require_full_cover=require_full_cover,
                                     codon_align=codon_align)
        except ValueError:
            # Sequences for which the alignment raises ValueError are skipped
            # on purpose (best effort)
            continue

        seq_by_subtype[subtype].append(rec)

    # Iterate over a snapshot of the keys: the values are rebound in the loop
    for subtype in list(seq_by_subtype):
        seq_by_subtype[subtype] = MultipleSeqAlignment(seq_by_subtype[subtype])

    return seq_by_subtype
def get_gene_HXB2(genename):
    '''Get a gene or exon in HXB2

    Parameters:
       genename: id of a feature annotated in HXB2, or one of 'tat1', 'tat2',
         'rev1', 'rev2' to get a single exon. For those, the trailing digit is
         the 1-based index into the parts of the 'tat'/'rev' feature location.

    Returns:
       whatever extracting the feature (or the exon location) from the HXB2
       record yields

    Raises:
       ValueError: if no HXB2 feature has the requested id
    '''
    from hivwholeseq.reference import load_custom_reference

    HXB2 = load_custom_reference('HXB2', format='gb')
    feature_ids = [fea.id for fea in HXB2.features]

    if genename in ('tat1', 'tat2', 'rev1', 'rev2'):
        exon_index = int(genename[-1]) - 1
        parent = HXB2.features[feature_ids.index(genename[:-1])]
        return parent.location.parts[exon_index].extract(HXB2)

    return HXB2.features[feature_ids.index(genename)].extract(HXB2)
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    Parameters:
       fea: feature to check; it is extracted from seqgw and its id is used
         both in error messages and as the HXB2 region to compare with
       seqgw: sequence record the feature refers to
       VERBOSE: if >= 3, pretty-print the alignment to HXB2
       delta_pos: maximal tolerated score deficit per alignment column

    Returns:
       None

    Raises:
       ValueError: if the length is not a multiple of 3, if the sequence has
         N or gaps, if the translation has a stop codon anywhere but at the
         end or any X, or if the alignment score to HXB2 is too low
    '''
    seq = fea.extract(seqgw).seq

    if len(seq) % 3:
        raise ValueError('The length of ' + fea.id + ' is not a multiple of 3')

    forbidden = (('N', 'N nucleotides found in '),
                 ('-', 'Gaps found in '))
    for symbol, problem in forbidden:
        if symbol in seq:
            raise ValueError(problem + fea.id)

    prot = seq.translate()

    # A stop codon is tolerated only as the very last amino acid
    first_stop = prot.find('*')
    if (first_stop >= 0) and (first_stop != len(prot) - 1):
        raise ValueError('Premature stops found in ' + fea.id)

    if 'X' in prot:
        raise ValueError('X amino acids found in ' + fea.id)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=fea.id)

    from seqanpy import align_global
    (score, alis, alir) = align_global(seq, ref, score_gapopen=-20)
    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # The score is compared to 3 points per alignment column
    # NOTE(review): presumably 3 is the aligner's match score - confirm
    n_columns = len(alis)
    deficit = 3 * n_columns - score
    if deficit > delta_pos * n_columns:
        raise ValueError('The sequence of ' + fea.id + ' looks different from HXB2')
def check_protein(fea, seqgw, VERBOSE=0, delta_pos=2.5):
    '''Check a protein annotation

    The feature is extracted from seqgw, checked at the nucleotide and amino
    acid level and finally aligned globally to the HXB2 region with the same
    id as the feature.

    Parameters:
       fea: feature to check
       seqgw: sequence record the feature is extracted from
       VERBOSE: if >= 3, pretty-print the pairwise alignment with HXB2
       delta_pos: tolerated score deficit per alignment column

    Returns:
       None

    Raises:
       ValueError: on a length not multiple of 3, N nucleotides, gaps,
         premature stops, X amino acids or a poor alignment score to HXB2
    '''
    feaname = fea.id
    seq = fea.extract(seqgw).seq

    # Nucleotide level checks
    if len(seq) % 3 != 0:
        raise ValueError('The length of ' + feaname + ' is not a multiple of 3')
    if 'N' in seq:
        raise ValueError('N nucleotides found in ' + feaname)
    if '-' in seq:
        raise ValueError('Gaps found in ' + feaname)

    # Amino acid level checks: a stop is fine only in the last position
    prot = seq.translate()
    stop_is_last = prot.find('*') == len(prot) - 1
    if ('*' in prot) and (not stop_is_last):
        raise ValueError('Premature stops found in ' + feaname)
    if 'X' in prot:
        raise ValueError('X amino acids found in ' + feaname)

    # Compare to HXB2
    from hivwholeseq.reference import load_custom_reference
    ref = load_custom_reference('HXB2', region=feaname)

    from seqanpy import align_global
    alignment = align_global(seq, ref, score_gapopen=-20)
    score, alis, alir = alignment

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali((alir, alis), name1='HXB2', name2='seq',
                                  width=100)

    # NOTE(review): 3 points per column is used as the best possible score;
    # presumably the aligner's match score - confirm
    ali_length = len(alis)
    scoremax = 3 * ali_length
    if scoremax - score > delta_pos * ali_length:
        raise ValueError('The sequence of ' + feaname + ' looks different from HXB2')
maps_coord = defaultdict(dict) for pname, patient in patients.iterrows(): patient = Patient(patient) # Make maps for all annotations if not explicit if regions is None: patseqann = patient.get_reference('genomewide', format='gb') regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region refseq = load_custom_reference(refname, format='gb', region=region) patseq = patient.get_reference(region) mapco = build_coordinate_map(refseq, patseq, VERBOSE=VERBOSE) mapco = np.array(mapco, int) shift_mapco(mapco, refname, region) maps_coord[(region, pname)] = mapco if save_to_file: out_fn = get_coordinate_map_filename(pname, region, refname=refname) np.savetxt(out_fn, mapco, fmt='%d', delimiter='\t', header=refname+'\t'+pname+'_'+region) if VERBOSE: print 'Saved to file:', pname, region
bins = np.exp(tbins)/(1+np.exp(tbins)) binsc = np.sqrt(bins[1:] * bins[:-1]) binw = np.diff(bins) hists = np.zeros((len(S_bins) - 1, len(binsc))) if VERBOSE >= 1: print 'Load alignment, reference, and coordinate map' ali = load_custom_alignment('HIV1_FLT_2013_genome_DNA') alim = np.array(ali, 'S1') S = np.zeros(alim.shape[1]) for a in alpha[:5]: nu = (alim == a).mean(axis=0) S -= nu * np.log2(nu + 1e-8) refname = 'HXB2' refseq = load_custom_reference('HXB2', format='gb') mapali = get_coordinate_map(refname, ali) if len(refseq) != mapali[0, -1] + 1: raise ValueError('Reference '+refname+' in alignment is not complete') Sref = S[mapali[1]] Srefind = - np.ones(len(Sref), int) for i, Sbin in enumerate(S_bins[:-2]): ind = (Sref >= Sbin) & (Sref < S_bins[i + 1]) Srefind[ind] = i Srefind[Srefind < 0] = len(S_bins) - 2 if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 2: print 'fragments', fragments
edges_chunk = all_edges[name] edges = find_region_edges(smat, edges_chunk) # Some features must be stripped of primers if name in ['V3']: edges[0] += len(edges_chunk[0]) edges[1] -= len(edges_chunk[1]) return [edges] # Script if __name__ == '__main__': seqold = load_custom_reference('HXB2', 'gb') seqnew = load_custom_reference('HXB2', 'fasta') smat = np.array(seqnew) print 'Add features' for typ, coord_typ in coordinates.iteritems(): for name, edges in coord_typ.iteritems(): # If coordinates are missing, grab primers if edges is None: edges = get_coordinates_feature(smat, name) if len(edges) == 1: fea = SeqFeature(FeatureLocation(edges[0][0], edges[0][1], strand=+1), type=typ, id=name)
regions = args.regions VERBOSE = args.verbose use_save = args.save use_joint = args.joint patients = load_patients() if pnames is not None: patients = patients.iloc[patients.index.isin(pnames)] pcodes = [p.code for _, p in patients.iterrows()] for region in regions: if use_joint: seqs_all = [] for refname in _refs: seq = load_custom_reference(refname, region=region) if refname == 'F10': refname = 'pZM246F_10' seq.id = 'Isolate_'+refname seq.name = 'Isolate_'+refname seq.description = 'Isolate: '+refname if refname in ['38540', 'pZM246F_10']: seq.subtype = 'C' else: seq.subtype = 'B' seqs_all.append(seq) for pname, patient in iterpatient(patients): patient.discard_nonsequenced_samples() if VERBOSE >= 1: print region, pname
else: PCRs_sample = [PCR] for PCR_sample in PCRs_sample: bamfilename = sample.get_mapped_filtered_filename(fragment, PCR=PCR_sample, decontaminated=False) if not os.path.isfile(bamfilename): continue # if check_already_decontaminated(sample, fragment, PCR_sample): # continue fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample) sys.exit() for fragment in fragments: consensi = {refname: "".join(load_custom_reference(refname + "_" + fragment)) for refname in refnames} for samplename, sample in samples.iterrows(): sample = SamplePat(sample) try: consensi[samplename] = sample.get_consensus(fragment, PCR=1) except IOError: print samplename, "file not found" continue for samplename, sample in samples_focal.iterrows(): sample = SamplePat(sample) pname = sample.patient if PCR is None: PCRs_sample = (1, 2) else:
#if check_already_decontaminated(sample, fragment, PCR_sample): # continue fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample) sys.exit() for fragment in fragments: consensi = { refname: ''.join(load_custom_reference(refname + '_' + fragment)) for refname in refnames } for samplename, sample in samples.iterrows(): sample = SamplePat(sample) try: consensi[samplename] = sample.get_consensus(fragment, PCR=1) except IOError: print samplename, 'file not found' continue for samplename, sample in samples_focal.iterrows(): sample = SamplePat(sample) pname = sample.patient if PCR is None:
for pname, patient in patients.iterrows(): patient = Patient(patient) # Make maps for all annotations if not explicit if regions is None: patseqann = patient.get_reference('genomewide', format='gb') regionspat = map(attrgetter('id'), patseqann.features) + ['genomewide'] else: regionspat = regions for region in regionspat: if VERBOSE >= 1: print pname, region refseq = load_custom_reference(refname, format='gb', region=region) patseq = patient.get_reference(region) mapco = build_coordinate_map(refseq, patseq, VERBOSE=VERBOSE) mapco = np.array(mapco, int) shift_mapco(mapco, refname, region) maps_coord[(region, pname)] = mapco if save_to_file: out_fn = get_coordinate_map_filename(pname, region, refname=refname) np.savetxt(out_fn, mapco, fmt='%d',
regions = args.regions VERBOSE = args.verbose use_save = args.save use_joint = args.joint patients = load_patients() if pnames is not None: patients = patients.iloc[patients.index.isin(pnames)] pcodes = [p.code for _, p in patients.iterrows()] for region in regions: if use_joint: seqs_all = [] for refname in _refs: seq = load_custom_reference(refname, region=region) if refname == 'F10': refname = 'pZM246F_10' seq.id = 'Isolate_' + refname seq.name = 'Isolate_' + refname seq.description = 'Isolate: ' + refname if refname in ['38540', 'pZM246F_10']: seq.subtype = 'C' else: seq.subtype = 'B' seqs_all.append(seq) for pname, patient in iterpatient(patients): patient.discard_nonsequenced_samples() if VERBOSE >= 1: print region, pname
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0): '''Annotate a consensus with the genes and stuff (in place)''' # TODO: what do we do with genes that do not start/end where they are # supposed to? Do we follow biology and track their new locations? from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \ other_edges, find_region_edges, find_region_edges_multiple, \ locate_gene edge_dict = {'gene': gene_edges, 'RNA structure': RNA_structure_edges, 'other': other_edges} edge_dict.update(additional_edges) additional_features = ['protein'] + additional_features features = edge_dict.keys() + additional_features if VERBOSE: print 'Features:', ', '.join(features) smat = np.array(seqrecord) for feature_type in edge_dict: edges_all = edge_dict[feature_type] print feature_type, edge_dict[feature_type].keys() for name, edges in edges_all.iteritems(): if VERBOSE >= 2: print name, # Skip a feature if it's present already if name in map(lambda x: x.id, seqrecord.features): if VERBOSE >= 2: print 'already present.' 
continue # Behave differently for unsplit regions and split ones if len(edges) == 2: # LTR problems with F6 if 'F6' in name: pos_edge = find_region_edges(smat[6000::], [edges[0], None]) pos_edge[0] += 6000 elif feature_type == 'genes': pos_edge = locate_gene(smat, name, output_compact=True) else: pos_edge = find_region_edges(smat, edges) # Cut the primers for some features if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']: pos_edge[0] += len(edges[0]) pos_edge[1] -= len(edges[1]) # Cut only the right primer for V2 if (None not in pos_edge) and name in ['V2']: pos_edge[1] -= len(edges[1]) if pos_edge[0] is None: if name not in ['F1', "LTR5'"]: print 'WARNING: start not found' pos_edge[0] = 0 if pos_edge[1] is None: if name not in ['F6', "LTR3'"]: print 'WARNING: end not found' pos_edge[1] = len(smat) location = FeatureLocation(*pos_edge) else: if feature_type == 'genes': pos_edges = [locate_gene(smat, name+suff, output_compact=True) for suff in ('1', '2')] else: pos_edges = find_region_edges_multiple(smat, edges, min_distance=1) locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges] location = CompoundLocation(locations) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feature_type, id=name, strand=1) seqrecord.features.append(feature) # Add proteins and other features from HXB2 from operator import attrgetter from seqanpy import align_overlap from hivwholeseq.utils.genome_info import proteins, chunks from hivwholeseq.reference import load_custom_reference additional_features_dict = {} if 'protein' in additional_features: additional_features_dict['protein'] = proteins if 'chunk' in additional_features: additional_features_dict['chunk'] = chunks ref_ann = load_custom_reference('HXB2', 'gb') for feagroup, additional_features_grp in additional_features_dict.iteritems(): for feaname in additional_features_grp: if VERBOSE >= 2: print feaname, fea = ref_ann.features[map(attrgetter('id'), 
ref_ann.features).index(feaname)] seq = fea.extract(ref_ann) (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20) start = len(ali2) - len(ali2.lstrip('-')) end = len(ali2.rstrip('-')) end -= ali1[start: end].count('-') location = FeatureLocation(start, end) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feagroup, id=feaname, strand=1) seqrecord.features.append(feature)
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0): '''Annotate a consensus with the genes and stuff (in place)''' # TODO: what do we do with genes that do not start/end where they are # supposed to? Do we follow biology and track their new locations? from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \ other_edges, find_region_edges, find_region_edges_multiple, \ locate_gene edge_dict = { 'gene': gene_edges, 'RNA structure': RNA_structure_edges, 'other': other_edges } edge_dict.update(additional_edges) additional_features = ['protein'] + additional_features features = edge_dict.keys() + additional_features if VERBOSE: print 'Features:', ', '.join(features) smat = np.array(seqrecord) for feature_type in edge_dict: edges_all = edge_dict[feature_type] print feature_type, edge_dict[feature_type].keys() for name, edges in edges_all.iteritems(): if VERBOSE >= 2: print name, # Skip a feature if it's present already if name in map(lambda x: x.id, seqrecord.features): if VERBOSE >= 2: print 'already present.' 
continue # Behave differently for unsplit regions and split ones if len(edges) == 2: # LTR problems with F6 if 'F6' in name: pos_edge = find_region_edges(smat[6000::], [edges[0], None]) pos_edge[0] += 6000 elif feature_type == 'genes': pos_edge = locate_gene(smat, name, output_compact=True) else: pos_edge = find_region_edges(smat, edges) # Cut the primers for some features if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']: pos_edge[0] += len(edges[0]) pos_edge[1] -= len(edges[1]) # Cut only the right primer for V2 if (None not in pos_edge) and name in ['V2']: pos_edge[1] -= len(edges[1]) if pos_edge[0] is None: if name not in ['F1', "LTR5'"]: print 'WARNING: start not found' pos_edge[0] = 0 if pos_edge[1] is None: if name not in ['F6', "LTR3'"]: print 'WARNING: end not found' pos_edge[1] = len(smat) location = FeatureLocation(*pos_edge) else: if feature_type == 'genes': pos_edges = [ locate_gene(smat, name + suff, output_compact=True) for suff in ('1', '2') ] else: pos_edges = find_region_edges_multiple(smat, edges, min_distance=1) locations = [ FeatureLocation(*pos_edge) for pos_edge in pos_edges ] location = CompoundLocation(locations) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feature_type, id=name, strand=1) seqrecord.features.append(feature) # Add proteins and other features from HXB2 from operator import attrgetter from seqanpy import align_overlap from hivwholeseq.utils.genome_info import proteins, chunks from hivwholeseq.reference import load_custom_reference additional_features_dict = {} if 'protein' in additional_features: additional_features_dict['protein'] = proteins if 'chunk' in additional_features: additional_features_dict['chunk'] = chunks ref_ann = load_custom_reference('HXB2', 'gb') for feagroup, additional_features_grp in additional_features_dict.iteritems( ): for feaname in additional_features_grp: if VERBOSE >= 2: print feaname, fea = ref_ann.features[map(attrgetter('id'), 
ref_ann.features).index(feaname)] seq = fea.extract(ref_ann) (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20) start = len(ali2) - len(ali2.lstrip('-')) end = len(ali2.rstrip('-')) end -= ali1[start:end].count('-') location = FeatureLocation(start, end) if VERBOSE >= 2: print 'found:', location feature = SeqFeature(location, type=feagroup, id=feaname, strand=1) seqrecord.features.append(feature)
help='Reference to use for alignment') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') parser.add_argument('--subtypes', nargs='+', default=['B'], help='Subtypes to keep') args = parser.parse_args() regions = args.regions refname = args.reference VERBOSE = args.verbose subtypes = args.subtypes from hivwholeseq.reference import load_custom_reference from hivwholeseq.utils.sequence import find_annotation ref = load_custom_reference('HXB2', 'gb') for region in regions: regm = np.array(find_annotation(ref, region).extract(ref), 'S1') for subtype in subtypes: fn = get_subtype_reference_alignment_filename(region, subtype=subtype, refname=refname, VERBOSE=VERBOSE) alim = np.array(AlignIO.read(fn, 'fasta'), 'S1') weird = ((alim != regm).mean(axis=1) > 0.2) print region, subtype, weird.sum()
parser.add_argument('--reference', required=True, help='Reference to analyze (e.g. LAI-III)') parser.add_argument('--fragments', nargs='+', default=fragments, help='Fragments to merge') parser.add_argument('--save', action='store_true', help='Save to file') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-3]') args = parser.parse_args() refname = args.reference use_save = args.save VERBOSE = args.verbose consensi = [load_custom_reference(refname+'_'+fr) for fr in fragments] consensus = merge_sequences(consensi, VERBOSE=VERBOSE) from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord consrec = SeqRecord(Seq(consensus, alphabet=consensi[0].seq.alphabet), id=refname+'_genomewide', name=refname+'_genomewide', description=refname+', genomewide reference (merged)', ) if use_save: if VERBOSE >= 1: print 'Save to file' from Bio import SeqIO fn = get_custom_reference_filename(refname, format='fasta')
def make_reference(data_folder, adaID, fragments, refname, VERBOSE=0, summary=True):
    '''Make reference sequence trimmed to the necessary parts

    Parameters:
       data_folder, adaID: identify the premap reference file and the premap
         summary file to write to
       fragments: list of fragment names, or None to keep the whole reference.
         NOTE: if the first/last name contains '+', that entry is replaced IN
         PLACE by the chosen alternative.
       refname: name of the custom reference to load
       VERBOSE: if true, print the primer positions found
       summary: if true, append a report to the premap summary file

    Returns:
       None. The (trimmed) reference is written in FASTA format.
    '''
    from hivwholeseq.reference import load_custom_reference

    seq = load_custom_reference(refname)
    output_filename = get_reference_premap_filename(data_folder, adaID)

    if fragments is not None:
        # Look for the first fwd and the last rev primers to trim the reference
        # NOTE: this works even if F1 or F6 are missing (e.g. only F2-5 are seq-ed)!
        # If more than one primer is used for the first or last fragment, take the
        # longest reference
        from hivwholeseq.data.primers import primers_PCR, primers_coordinates_HXB2

        def split_alternatives(fragment):
            '''Expand the "+"-separated middle part into full fragment names'''
            head, tail = fragment[:2], fragment[-1]
            return [head + middle + tail
                    for middle in fragment[2:-1].split('+')]

        if '+' in fragments[0]:
            candidates = split_alternatives(fragments[0])
            starts = [primers_coordinates_HXB2[cand][0][0]
                      for cand in candidates]
            fragments[0] = candidates[np.argmin(starts)]
        pr_fwd = primers_PCR[fragments[0]][0]

        # Read only now: for a single fragment, [0] and [-1] are the same entry
        if '+' in fragments[-1]:
            candidates = split_alternatives(fragments[-1])
            ends = [primers_coordinates_HXB2[cand][1][1]
                    for cand in candidates]
            fragments[-1] = candidates[np.argmax(ends)]
        pr_rev = primers_PCR[fragments[-1]][1]

        smat = np.array(seq)
        len_fwd = len(pr_fwd)
        len_rev = len(pr_rev)

        # Get all possible primers from ambiguous nucleotides and get the best match
        from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas

        def count_matches(primer, positions):
            '''Best number of matching bases over all expansions, per position'''
            expansions = np.array(map(list, eas(primer)), 'S1')
            width = len(primer)
            return [(smat[i:i + width] == expansions).sum(axis=1).max()
                    for i in positions]

        n_matches_fwd = count_matches(pr_fwd, xrange(len(seq) - len_fwd))
        pr_fwd_pos = np.argmax(n_matches_fwd)

        n_matches_rev = count_matches(pr_rev,
                                      xrange(pr_fwd_pos + len_fwd,
                                             len(seq) - len_rev))
        # Here you come from the right, i.e. look in the 3' LTR first
        pr_rev_pos = len(seq) - len_rev - 1 - np.argmax(n_matches_rev[::-1])

        rows = [['Reference name:', refname],
                ['FWD primer:', fragments[0], str(pr_fwd_pos), pr_fwd],
                ['REV primer:', fragments[-1], str(pr_rev_pos), pr_rev]]
        output = '\n'.join(' '.join(row) for row in rows)
        if VERBOSE:
            print(output)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write(output + '\n')

        # The reference includes both the first fwd primer and the last rev one
        trim_end = pr_rev_pos + len_rev
        first_label = str(pr_fwd_pos + 1)
        last_label = str(trim_end)

        seq_trim = seq[pr_fwd_pos:trim_end]
        seq_trim.id = '_'.join([seq_trim.id, first_label, last_label])
        seq_trim.name = '_'.join([seq_trim.name, first_label, last_label])
        seq_trim.description = ' '.join([seq_trim.description,
                                         'from', first_label,
                                         'to', last_label,
                                         '(indices from 1, extremes included)'])
    else:
        seq_trim = seq

    SeqIO.write(seq_trim, output_filename, 'fasta')

    if summary:
        with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Reference sequence written to: ' + output_filename + '\n')
from hivwholeseq.utils.genome_info import all_edges, find_region_edges edges_chunk = all_edges[name] edges = find_region_edges(smat, edges_chunk) # Some features must be stripped of primers if name in ['V3']: edges[0] += len(edges_chunk[0]) edges[1] -= len(edges_chunk[1]) return [edges] # Script if __name__ == '__main__': seqold = load_custom_reference('HXB2', 'gb') seqnew = load_custom_reference('HXB2', 'fasta') smat = np.array(seqnew) print 'Add features' for typ, coord_typ in coordinates.iteritems(): for name, edges in coord_typ.iteritems(): # If coordinates are missing, grab primers if edges is None: edges = get_coordinates_feature(smat, name) if len(edges) == 1: fea = SeqFeature(FeatureLocation(edges[0][0], edges[0][1], strand=+1),
def correlate_epitope_substitution(ds, dctl): '''Correlate presence of a substitution with epitope''' from hivwholeseq.data.primers import primers_coordinates_HXB2_outer start_F1 = primers_coordinates_HXB2_outer['F1'][0][1] end_F6 = primers_coordinates_HXB2_outer['F6'][1][0] ds = ds.copy() dg = [] for pcode, datum in dctl.groupby('pcode'): a = np.arange(start_F1, end_F6) b = np.zeros(len(a), bool) for _, epi in datum.iterrows(): b[(a >= epi['start_HXB2']) & (a < epi['end_HXB2'])] = True c = np.zeros(len(a), bool) datum = ds.loc[ds['pcode'] == pcode] # Keep only nonsyn substitutions datum = datum.loc[datum['syn'] == False] c[datum['pos_ref'] - a[0]] = True dat = { 'pos': a, 'epitope': b, 'substitution': c, } dat = pd.DataFrame(dat) dat['pcode'] = pcode dg.append(dat) dg = pd.concat(dg) # Exclude env because it has antibody-related substitutions from hivwholeseq.reference import load_custom_reference from hivwholeseq.utils.sequence import find_annotation ref = load_custom_reference('HXB2', 'gb') start_env = find_annotation(ref, 'gp41').location.nofuzzy_start end_env = find_annotation(ref, 'gp41').location.nofuzzy_end - 450 dg = dg.loc[(dg['pos'] < start_env) | (dg['pos'] >= end_env)] M = dg.groupby(['epitope', 'substitution']).size().unstack() Ma = np.array(M) xp = 1.0 * Ma[1, 0] / Ma[0, 0] * Ma[0, 1] xs = Ma[1, 1] - xp print M from scipy.stats import fisher_exact print 'Fisher\'s exact enrichment:', fisher_exact(Ma)[0] print 'Fisher\'s exact P value:', fisher_exact(Ma)[1] print 'expected:', xp print 'excess:', xs, 'per patient:', xs / 9.0 pos_epi = dg.loc[dg['epitope'] == True]['pos'].unique() dg2 = dg.loc[dg['pos'].isin(pos_epi)].copy() M2 = dg2.groupby(['epitope', 'substitution']).size().unstack() M2a = np.array(M2) xp = 1.0 * M2a[1, 0] / M2a[0, 0] * M2a[0, 1] xs = M2a[1, 1] - xp print M2 print '\nFisher\'s exact enrichment:', fisher_exact(M2a)[0] print 'Fisher\'s exact P value:', fisher_exact(M2a)[1] print 'expected:', xp print 'excess:', xs, 'per patient:', xs / 
9.0 return { 'dg': dg, 'dg2': dg2, }
parser.add_argument('--reference', default='HXB2', help='Reference to use for alignment') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-4]') parser.add_argument('--subtypes', nargs='+', default=['B'], help='Subtypes to keep') args = parser.parse_args() regions = args.regions refname = args.reference VERBOSE = args.verbose subtypes = args.subtypes from hivwholeseq.reference import load_custom_reference from hivwholeseq.utils.sequence import find_annotation ref = load_custom_reference('HXB2', 'gb') for region in regions: regm = np.array(find_annotation(ref, region).extract(ref), 'S1') for subtype in subtypes: fn = get_subtype_reference_alignment_filename(region, subtype=subtype, refname=refname, VERBOSE=VERBOSE) alim = np.array(AlignIO.read(fn, 'fasta'), 'S1') weird = ((alim != regm).mean(axis=1) > 0.2) print region, subtype, weird.sum()
parser.add_argument('--fragments', nargs='+', default=fragments, help='Fragments to merge') parser.add_argument('--save', action='store_true', help='Save to file') parser.add_argument('--verbose', type=int, default=0, help='Verbosity level [0-3]') args = parser.parse_args() refname = args.reference use_save = args.save VERBOSE = args.verbose consensi = [load_custom_reference(refname + '_' + fr) for fr in fragments] consensus = merge_sequences(consensi, VERBOSE=VERBOSE) from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord consrec = SeqRecord( Seq(consensus, alphabet=consensi[0].seq.alphabet), id=refname + '_genomewide', name=refname + '_genomewide', description=refname + ', genomewide reference (merged)', ) if use_save: if VERBOSE >= 1: print 'Save to file' from Bio import SeqIO