def get_other_consensi_seqrun(dataset, samplename, fragment, VERBOSE=0):
    '''Get consensi of other samples except the focal one

    Parameters:
       dataset: object whose `samples` table is iterated row by row via
          iterrows(); the row label is used as the sample name.
       samplename: label of the focal sample, which is skipped.
       fragment: fragment name, passed to get_consensus_filename and used
          in the verbose message.
       VERBOSE: at 3 or above, print a message for every sample whose
          consensus file does not exist.

    Returns:
       list with one record (as returned by SeqIO.read(..., 'fasta')) per
       non-focal sample whose consensus file exists; samples without a
       file are left out. The list is empty if nothing is found.
    '''
    consensi = []
    for samplename_other, row in dataset.samples.iterrows():
        if samplename_other == samplename:
            continue

        ref_fn = SampleSeq(row).get_consensus_filename(fragment)
        if not os.path.isfile(ref_fn):
            if VERBOSE >= 3:
                # One parenthesised argument: identical output whether
                # print is a statement (Python 2) or a function (Python 3)
                print(samplename_other + ': consensus for fragment ' +
                      fragment + ' not found, skipping')
            continue

        consensi.append(SeqIO.read(ref_fn, 'fasta'))

    return consensi
def get_other_consensi_seqrun(dataset, samplename, fragment, VERBOSE=0):
    '''Get consensi of other samples except the focal one

    Parameters:
       dataset: object whose `samples` table is iterated row by row via
          iterrows(); the row label is used as the sample name.
       samplename: label of the focal sample, which is skipped.
       fragment: fragment name, passed to get_consensus_filename and used
          in the verbose message.
       VERBOSE: at 3 or above, print a message for every sample whose
          consensus file does not exist.

    Returns:
       list with one record (as returned by SeqIO.read(..., 'fasta')) per
       non-focal sample whose consensus file exists; samples without a
       file are left out. The list is empty if nothing is found.
    '''
    consensi = []
    for samplename_other, row in dataset.samples.iterrows():
        # The focal sample itself is never part of the result
        if samplename_other == samplename:
            continue

        ref_fn = SampleSeq(row).get_consensus_filename(fragment)
        if os.path.isfile(ref_fn):
            consensi.append(SeqIO.read(ref_fn, 'fasta'))
        elif VERBOSE >= 3:
            # One parenthesised argument: identical output whether print
            # is a statement (Python 2) or a function (Python 3)
            print(samplename_other + ': consensus for fragment ' +
                  fragment + ' not found, skipping')

    return consensi
def complement_consensus_PCR2(cons_rec, patient, fragment, samplen, VERBOSE=0):
    '''Complement consensus from PCR2 with wings from later PCR1 sample

    The first sequenced sample with PCR == 1 among the patient samples that
    come after position `samplen` is located, its consensus for `fragment`
    is read, and the parts of that later consensus lying before the match
    of the first 20 positions of `cons_rec` and after the match of its last
    20 positions are attached around `cons_rec`.

    Parameters:
       cons_rec: the PCR2 consensus to be complemented.
       patient: object whose `samples` table supports .iloc slicing and
          iterrows(); each row needs a 'samples seq' table.
       fragment: generic fragment name; must be listed in the later
          sample's regions_generic.
       samplen: position of the focal sample in patient.samples; only
          samples after it are searched.
       VERBOSE: at 1 or above, print which later sample is being used.

    Returns:
       tuple (frag_spec, complemented consensus), where frag_spec is the
       entry of the later sample's regions_complete at the index of
       `fragment` in its regions_generic.

    Raises:
       ValueError: if no later sample sequenced from PCR1 is found.
    '''
    # Look for the first later sample that was sequenced from PCR1
    sampleseq_later = None
    for _, sampletmp in patient.samples.iloc[samplen + 1:].iterrows():
        for _, sampleseqtmp in sampletmp['samples seq'].iterrows():
            sampleseqtmp = SampleSeq(sampleseqtmp)
            if int(sampleseqtmp.PCR) == 1:
                sampleseq_later = sampleseqtmp
                break
        if sampleseq_later is not None:
            break

    # Without this check the code below would fail with an opaque
    # UnboundLocalError on a name that was never assigned
    if sampleseq_later is None:
        raise ValueError('No later PCR1 sample found after sample number ' +
                         str(samplen) + ' to complement fragment ' +
                         str(fragment))

    # Only needed once a later sample is available
    from hivwholeseq.utils.sequence import find_seed_imperfect, rfind_seed_imperfect

    adaID_later = sampleseq_later['adapter']
    data_folder_later = sampleseq_later.sequencing_run.folder
    cons_rec_later = SeqIO.read(
        get_consensus_filename(data_folder_later, adaID_later, fragment),
        'fasta')
    conss_later = str(cons_rec_later.seq)

    # NOTE(review): the seed finders presumably return the start position
    # of the (imperfect) match in the later consensus -- confirm in
    # hivwholeseq.utils.sequence. The +20 moves past the 20-long end seed.
    start = find_seed_imperfect(cons_rec_later, cons_rec[:20])
    end = rfind_seed_imperfect(cons_rec_later, cons_rec[-20:]) + 20

    if VERBOSE >= 1:
        # Single space-joined argument: same line of output as the former
        # chained print statements, and valid on Python 2 and 3 alike
        print(' '.join(['Complementing PCR2 consensus with later PCR1:',
                        str(sampleseq_later.name),
                        str(sampleseq_later['seq run']),
                        str(sampleseq_later.adapter)]))

    frag_spec = sampleseq_later.regions_complete[
        sampleseq_later.regions_generic.index(fragment)]

    return (frag_spec, conss_later[:start] + cons_rec + conss_later[end:])
# Specify the dataset dataset = load_sequencing_run(seq_run) data_folder = dataset.folder # If the script is called with no adaID, iterate over all dataset.discard_nondivided_samples() samples = dataset.samples if adaIDs is not None: samples = samples.loc[samples.adapter.isin(adaIDs)] if VERBOSE >= 3: print 'adaIDs', samples.adapter # Iterate over all requested samples for samplename, sample in samples.iterrows(): sample = SampleSeq(sample) adaID = sample.adapter # If the script is called with no fragment, iterate over all if not fragments: fragments_sample = sample.regions_complete else: from re import findall fragments_all = sample.regions_complete fragments_sample = [] for fragment in fragments: frs = filter(lambda x: fragment in x, fragments_all) if len(frs): fragments_sample.append(frs[0]) if 'genomewide' in fragments: fragments_sample.append('genomewide')
if adaIDs is not None: samples = samples.loc[samples.adapter.isin(adaIDs)] if use_pats: samples = samples.loc[samples['patient sample'] != 'nan'] else: samples = samples.loc[samplenames] if fragments is None: fragments = ['F'+str(i+1) for i in xrange(6)] alis = {fr: AlignIO.read(get_consensi_alignment_filename('all', fr), 'fasta') for fr in fragments} for samplename, sample in samples.iterrows(): sample = SampleSeq(sample) data_folder = sample.seqrun_folder adaID = sample.adapter pname = sample.patientname for fragment in fragments: if VERBOSE >= 1: print sample['seq run'], adaID, fragment, samplename, # Read the summary filename of the filter_mapped, and find out whether # there are many distant reads (a few are normal) fn = get_filter_mapped_summary_filename(data_folder, adaID, fragment) if os.path.isfile(fn): found = False with open(fn, 'r') as f: for line in f:
samples = load_samples_sequenced() if pnames is not None: samples = samples.loc[samples.patient.isin(pnames)] elif samplenames is not None: samples = samples.loc[samples.index.isin(samplenames)] for samplename, sample in samples.iterrows(): sample = SamplePat(sample) if VERBOSE >= 1: print samplename dist_hists = [] samples_seq = sample.get_sequenced_samples() samples_seq = samples_seq.loc[samples_seq.PCR == 1] for samplename_seq, sample_seq in samples_seq.iterrows(): sample_seq = SampleSeq(sample_seq) data_folder = sample_seq.seqrun_folder adaID = sample_seq.adapter for fragment in fragments: try: dist_hist = get_distance_histogram(data_folder, adaID, fragment, VERBOSE=VERBOSE) except IOError: continue dist_hists.append((samplename_seq, fragment, dist_hist)) dist_hists.sort(key=itemgetter(1))
samplenames_pat = samples_pat.index[ind] samples_seq = samples_seq.loc[samples_seq['patient sample'].isin(samplenames_pat)] else: samples_seq = samples_seq.loc[samples_seq.index.isin(samplenames)] if VERBOSE >= 2: print 'samples', samples_seq.index.tolist() # If the script is called with no fragment, iterate over all if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments for samplename, sample in samples_seq.iterrows(): sample = SampleSeq(sample) samplename_pat = sample['patient sample'] sample_pat = samples_pat.loc[samplename_pat] sample['patient'] = pname = sample_pat.patient PCR = int(sample.PCR) fragments_sample = sorted(set(sample.regions_generic) & set(fragments)) if VERBOSE: print samplename, samplename_pat, pname, PCR if not skip_hash: make_output_folders(pname, samplename_pat, PCR=PCR, VERBOSE=VERBOSE) for fragment in fragments_sample: if VERBOSE:
data_folder = dataset.folder samples = dataset.samples if not fragments: fragments = ['F' + str(i + 1) for i in xrange(6)] matrices = {} for fragment in fragments: if VERBOSE: print fragment samples_frag = samples.loc[[ os.path.isfile( SampleSeq(s).get_mapped_filename(fragment, type='bam', filtered=False)) for sn, s in samples.iterrows() ]] n_samples = len(samples_frag) consensi = [ SeqIO.read(SampleSeq(s).get_consensus_filename(fragment), 'fasta') for sn, s in samples_frag.iterrows() ] labels = [(sn, s.adapter) for sn, s in samples_frag.iterrows()] m = np.zeros((n_samples, n_samples), int) for si, (samplename, sample) in enumerate(samples_frag.iterrows()): if VERBOSE == 1: print samplename,
patient.discard_nonsequenced_samples() mkdirs(get_initial_reference_foldername(pname)) if not fragments: fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments if samplename is None: sample = SamplePat(patient.samples.iloc[samplen]) else: sample = load_sample_sequenced(samplename) for fragment in fragments: sample_seq = SampleSeq(sample.samples_seq.iloc[repn]) seq_run = sample_seq['seq run'] adaID = sample_seq['adapter'] dataset = sample_seq.sequencing_run data_folder = dataset.folder if VERBOSE: print 'Initial sample:', sample_seq.name, sample_seq['seq run'], print sample_seq.adapter cons_rec = SeqIO.read( get_consensus_filename(data_folder, adaID, fragment), 'fasta') frag_spec = sample_seq.regions_complete[\ sample_seq.regions_generic.index(fragment)]
savefig = args.savefig only_filt = args.only_filt # Specify the dataset dataset = load_sequencing_run(seq_run) data_folder = dataset['folder'] # If the script is called with no adaID, iterate over all if not adaIDs: adaIDs = dataset['adapters'] if VERBOSE >= 3: print 'adaIDs', adaIDs # Iterate over all requested samples for adaID in adaIDs: # If the script is called with no fragment, iterate over all sample = SampleSeq(dataset.samples.loc[dataset.samples.adapter == adaID].iloc[0]) samplename = sample.name if not fragments: fragments_sample = sample.regions_generic else: fragments_sample = sorted(set(fragments) & set(sample.regions_generic)) if not only_filt: plot_minor_allele_frequency(data_folder, adaID, fragments_sample, VERBOSE=VERBOSE, savefig=savefig) else: plot_minor_allele_frequency_filtered(data_folder, adaID, fragments_sample, VERBOSE=VERBOSE, savefig=savefig)