def run(acc, splitNum, splitNo):
    """Print the reads, and their fragments, for one slice of a run.

    The run's read count is divided into ``splitNum`` slices and the slice
    selected by ``splitNo`` is printed.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the read count is divided into.
        splitNo: Slice to print; 1 selects the slice starting at the
            first read.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as collection:
        name = collection.getName()

        # compute the window [begin, end) to iterate through
        total = collection.getReadCount()
        per_slice = total / splitNum
        begin = int(round(per_slice * (splitNo - 1)))
        end = min(int(round(per_slice * splitNo)), total)

        # start iterator on reads
        with collection.getReadRange(begin + 1, end - begin, Read.all) as reads:
            spots = 0
            while reads.nextRead():
                spots += 1
                print(reads.getReadId())

                # iterate through fragments, skipping those without bases
                while reads.nextFragment():
                    seq = reads.getFragmentBases()
                    if not seq:
                        continue
                    state = "aligned" if reads.isAligned() else "unaligned"
                    print("\t{} - {}".format(seq, state))

                print("\n")

            print("Read {} spots for {}".format(spots, name))
def run(acc, splitNum, splitNo):
    """Dump one slice of a run's reads together with their fragments.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: How many slices the run's read count is split into.
        splitNo: Which of those slices to dump (1 is the first).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as read_collection:
        accession_name = read_collection.getName()

        # compute window to iterate through
        row_count = read_collection.getReadCount()
        slice_size = row_count / splitNum
        window_start = int(round(slice_size * (splitNo - 1)))
        window_stop = int(round(slice_size * splitNo))
        window_stop = min(window_stop, row_count)
        window_length = window_stop - window_start

        # start iterator on reads
        with read_collection.getReadRange(window_start + 1, window_length, Read.all) as cursor:
            read_total = 0
            while cursor.nextRead():
                read_total += 1
                print(cursor.getReadId())

                # iterate through fragments
                while cursor.nextFragment():
                    fragment = cursor.getFragmentBases()
                    if fragment:
                        label = "aligned" if cursor.isAligned() else "unaligned"
                        print("\t{} - {}".format(fragment, label))

                print("\n")

            print("Read {} spots for {}".format(read_total, accession_name))
def run(acc, splitNum, splitNo):
    """Print one slice of a run's primary alignments as tab-separated rows.

    Each row holds the read id, reference spec, alignment position, short
    CIGAR, fragment bases and an aligned/unaligned flag.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the alignment count is divided into.
        splitNo: Slice to print; 1 selects the first slice.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as collection:
        name = collection.getName()

        # compute the window [begin, end) to iterate through
        total = collection.getAlignmentCount()
        per_slice = total / splitNum
        begin = int(round(per_slice * (splitNo - 1)))
        end = min(int(round(per_slice * splitNo)), total)

        # start iterator on alignments
        with collection.getAlignmentRange(begin + 1, end - begin, Alignment.primaryAlignment) as alignments:
            seen = 0
            while alignments.nextAlignment():
                columns = (
                    alignments.getReadId(),
                    alignments.getReferenceSpec(),
                    alignments.getAlignmentPosition(),
                    alignments.getShortCigar(False),
                    alignments.getFragmentBases(),
                    "aligned" if alignments.isAligned() else "unaligned",
                )
                print("{}\t{}\t{}\t{}\t{}\t{}".format(*columns))
                seen += 1

            print("Read {} alignments for {}".format(seen, name))
def sra_reader(accn, batch_size=1000, max_reads=None):
    """Iterates through a read collection for a given accession number using
    the ngs-lib python bindings.

    Args:
        accn: The accession number
        batch_size: The maximum number of reads to request in each call to SRA
        max_reads: The total number of reads to process, or all reads in the
            SRA run if None (or 0). Values larger than the run's read count
            are clamped to the read count.

    Yields:
        The result of ``sra_read_pair`` for each read, in order.
    """
    with NGS.openReadCollection(accn) as run:
        read_count = run.getReadCount()
        if max_reads:
            max_reads = min(read_count, max_reads)
        else:
            max_reads = read_count

        # Batches start at read 1, so the exclusive stop must be
        # max_reads + 1; with a stop of max_reads the final read was skipped
        # whenever it would have opened a new batch (e.g. max_reads == 1).
        for first_read in range(1, max_reads + 1, batch_size):
            cur_batch_size = min(batch_size, max_reads - first_read + 1)
            with run.getReadRange(first_read, cur_batch_size, Read.all) as read:
                for _ in range(cur_batch_size):
                    # Stop this batch if the iterator reports no further read
                    # instead of yielding from an exhausted iterator.
                    if not read.nextRead():
                        break
                    yield sra_read_pair(read)
def test_ReadGroupIterator_ThrowBeforeNext(self):
    """getName() on a read-group iterator must raise ErrorMsg before the
    iterator has been advanced."""
    collection = NGS.openReadCollection(PrimaryOnly)
    groups = collection.getReadGroups()
    try:
        groups.getName()
    except ErrorMsg:
        # expected: the iterator has not been positioned yet
        return
    self.fail()
def run(acc, splitNum, splitNo):
    """Print the reads, and their fragments, for one slice of a run.

    The read collection and the read iterator are opened as context
    managers so the underlying NGS objects are released when the function
    finishes, including when an error interrupts the iteration.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the read count is divided into.
        splitNo: Slice to print; 1 selects the first slice.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # compute window to iterate through
        MAX_ROW = run.getReadCount()
        chunk = MAX_ROW / splitNum
        first = int(round(chunk * (splitNo - 1)))

        next_first = int(round(chunk * (splitNo)))
        if next_first > MAX_ROW:
            next_first = MAX_ROW

        # start iterator on reads
        with run.getReadRange(first + 1, next_first - first, Read.all) as it:
            i = 0
            while it.nextRead():
                i += 1

                print(it.getReadId())

                # iterate through fragments
                while it.nextFragment():
                    bases = it.getFragmentBases()
                    if bases:
                        print("\t" + bases + " - " + ("aligned" if it.isAligned() else "unaligned"))

                print("\n")

            print("Read {} spots for {}".format(i, run_name))
def run(acc, splitNum, splitNo):
    """Print one slice of a run's primary alignments as tab-separated rows.

    Each row holds the read id, reference spec, alignment position, short
    CIGAR, fragment bases and an aligned/unaligned flag. The collection and
    the iterator are opened as context managers so the NGS objects are
    released when the function finishes.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the alignment count is divided into.
        splitNo: Slice to print; 1 selects the first slice.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # compute window to iterate through
        MAX_ROW = run.getAlignmentCount()
        chunk = MAX_ROW / splitNum
        first = int(round(chunk * (splitNo - 1)))

        next_first = int(round(chunk * (splitNo)))
        if next_first > MAX_ROW:
            next_first = MAX_ROW

        # start iterator on alignments
        with run.getAlignmentRange(first + 1, next_first - first, Alignment.primaryAlignment) as it:
            i = 0
            while it.nextAlignment():
                print(
                    it.getReadId() + "\t"
                    + it.getReferenceSpec() + "\t"
                    + str(it.getAlignmentPosition()) + "\t"
                    + it.getShortCigar(False) + "\t"
                    + it.getFragmentBases() + "\t"
                    + ("aligned" if it.isAligned() else "unaligned")
                )
                i += 1

            print("Read {} alignments for {}".format(i, run_name))
def run(acc, splitNum, splitNo):
    """Dump one slice of a run's reads together with their fragments.

    Both the read collection and the read iterator are used as context
    managers, so the NGS objects are released on exit rather than being
    left for the garbage collector.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: How many slices the run's read count is split into.
        splitNo: Which of those slices to dump (1 is the first).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # compute window to iterate through
        MAX_ROW = run.getReadCount()
        chunk = MAX_ROW / splitNum
        first = int(round(chunk * (splitNo - 1)))

        next_first = int(round(chunk * (splitNo)))
        if next_first > MAX_ROW:
            next_first = MAX_ROW

        # start iterator on reads
        with run.getReadRange(first + 1, next_first - first, Read.all) as it:
            i = 0
            while it.nextRead():
                i += 1

                print(it.getReadId())

                # iterate through fragments
                while it.nextFragment():
                    bases = it.getFragmentBases()
                    if bases:
                        print("\t" + bases + " - " + ("aligned" if it.isAligned() else "unaligned"))

                print("\n")

            print("Read {} spots for {}".format(i, run_name))
def run(acc, splitNum, splitNo):
    """Print one slice of a run's primary alignments as tab-separated rows.

    Each row holds the read id, reference spec, alignment position, short
    CIGAR and fragment bases. The collection and the iterator are opened as
    context managers so the NGS objects are released when the function
    finishes.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the alignment count is divided into.
        splitNo: Slice to print; 1 selects the first slice.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # compute window to iterate through
        MAX_ROW = run.getAlignmentCount()
        chunk = MAX_ROW / splitNum
        first = int(round(chunk * (splitNo - 1)))

        next_first = int(round(chunk * (splitNo)))
        if next_first > MAX_ROW:
            next_first = MAX_ROW

        # start iterator on alignments
        with run.getAlignmentRange(first + 1, next_first - first, Alignment.primaryAlignment) as it:
            i = 0
            while it.nextAlignment():
                print(
                    it.getReadId() + "\t"
                    + it.getReferenceSpec() + "\t"
                    + str(it.getAlignmentPosition()) + "\t"
                    + it.getShortCigar(False) + "\t"
                    + it.getFragmentBases()
                )
                i += 1

            print("Read {} alignments for {}".format(i, run_name))
def stream_reads(self, acc, event, splitNum=1, splitNo=1):
    '''
    Write fragments of an accession to the path returned by
    ``self.get_pipe(acc)`` as four-line ``@id / bases / + / qualities``
    records, then remove that path.

    This is a blocking task, it needs to be run in an executor

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        event: Object whose ``clear()`` is called once the pipe is open.
        splitNum: Number of slices the read count is divided into.
        splitNo: Slice to stream; 1 selects the first slice.

    Returns:
        None
    '''
    print(f'Streaming {acc}', file=sys.stderr)
    pipe_path = self.get_pipe(acc)

    # The context manager closes the pipe even if streaming fails, which
    # flushes buffered records and releases the handle before the path is
    # unlinked below.
    with open(pipe_path, 'w') as pipe:
        event.clear()

        # open requested accession using SRA implementation of the API
        with NGS.openReadCollection(acc) as run:
            # compute window to iterate through
            MAX_ROW = run.getReadCount()
            chunk = MAX_ROW / splitNum
            first = int(round(chunk * (splitNo - 1)))

            next_first = int(round(chunk * (splitNo)))
            if next_first > MAX_ROW:
                next_first = MAX_ROW

            # start iterator on reads
            with run.getReadRange(first + 1, next_first - first, Read.all) as it:
                i = 0
                while it.nextRead():
                    i += 1
                    # at most 20000 reads are streamed per call
                    if i > 20000:
                        break
                    while it.nextFragment():
                        bases = it.getFragmentBases()
                        qualities = it.getFragmentQualities()
                        ids = it.getFragmentId()
                        # fragments without bases are skipped
                        if bases:
                            read = f'@{ids}\n{bases}\n+\n{qualities}'
                            print(read, file=pipe)

    os.unlink(pipe_path)
    print(f'Done streaming for {acc}')
    return None
def run(acc, splitNum, splitNo):
    """Print one slice of a run's primary alignments as tab-separated rows.

    Each row holds the read id, reference spec, alignment position, short
    CIGAR and fragment bases.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        splitNum: Number of slices the alignment count is divided into.
        splitNo: Slice to print; 1 selects the first slice.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as read_collection:
        accession_name = read_collection.getName()

        # compute window to iterate through
        row_count = read_collection.getAlignmentCount()
        slice_size = row_count / splitNum
        window_start = int(round(slice_size * (splitNo - 1)))
        window_stop = min(int(round(slice_size * splitNo)), row_count)
        window_length = window_stop - window_start

        # start iterator on alignments
        with read_collection.getAlignmentRange(window_start + 1, window_length, Alignment.primaryAlignment) as cursor:
            printed = 0
            while cursor.nextAlignment():
                row = (
                    cursor.getReadId(),
                    cursor.getReferenceSpec(),
                    cursor.getAlignmentPosition(),
                    cursor.getShortCigar(False),
                    cursor.getFragmentBases(),
                )
                print("{}\t{}\t{}\t{}\t{}".format(*row))
                printed += 1

            print("Read {} alignments for {}".format(printed, accession_name))
def run(acc, refName, start, stop):
    """Print one pileup line per position of a reference range.

    Each line holds the reference spec, the position (printed one higher
    than the iterator reports), the reference base, the pileup depth, an
    encoded event string and the concatenated alignment qualities.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference name passed to ``getReference``.
        start: First position of the range; ``start - 1`` is handed to
            ``getPileupSlice``.
        stop: Last position of the range (inclusive).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as collection:
        name = collection.getName()

        # get requested reference
        with collection.getReference(refName) as reference:
            # start iterator on requested range
            with reference.getPileupSlice(start - 1, stop - start + 1) as pileup:
                count = 0
                while pileup.nextPileup():
                    quals = []
                    events = []
                    prefix = "{}\t{}\t{}\t{}".format(
                        pileup.getReferenceSpec(),
                        pileup.getReferencePosition() + 1,
                        pileup.getReferenceBase(),
                        pileup.getPileupDepth()
                    )

                    while pileup.nextPileupEvent():
                        flags = pileup.getEventType()
                        reverse = (flags & PileupEvent.alignment_minus_strand) != 0

                        # start of an alignment: '^' then mapping quality as a character
                        if (flags & PileupEvent.alignment_start) != 0:
                            events.append("^")
                            events.append(chr(pileup.getMappingQuality() + 33))

                        # insertion: '+', length, then the bases (lower case on minus strand)
                        if (flags & PileupEvent.insertion) != 0:
                            events.append("+")
                            inserted = pileup.getInsertionBases()
                            events.append(str(len(inserted)))
                            events.append(inserted.lower() if reverse else inserted)

                        # the low three bits select deletion / match / mismatch
                        kind = flags & 7
                        if kind == PileupEvent.deletion:
                            events.append("<" if reverse else ">")
                        elif kind == PileupEvent.match:
                            events.append("," if reverse else ".")
                        elif kind == PileupEvent.mismatch:
                            called = str(pileup.getAlignmentBase())
                            events.append(called.lower() if reverse else called.upper())

                        if (flags & PileupEvent.alignment_stop) != 0:
                            events.append("$")

                        quals.append(pileup.getAlignmentQuality())

                    count += 1
                    print("{}\t{}\t{}".format(prefix, "".join(events), "".join(quals)))

                print("Read {} pileups for {}".format(count, name))
def test_ReferenceWindow_Slice_Filtered_Category(self):
    """A primary-only alignment slice yields exactly alignments 33, 34 and
    35 and then ends."""
    collection = NGS.openReadCollection(WithSecondary)
    reference = collection.getReference("gi|169794206|ref|NC_010410.1|")
    alignments = reference.getAlignmentSlice(516000, 100000, Alignment.primaryAlignment)

    for suffix in (".PA.33", ".PA.34", ".PA.35"):
        self.assertTrue(alignments.nextAlignment())
        self.assertEqual(WithSecondary + suffix, alignments.getAlignmentId())

    # no secondary
    self.assertFalse(alignments.nextAlignment())
def search_reads(request):
    """
    Searches a genomic interval in the NCBI API and returns a list of
    converted GA4GH alignments

    Args:
        request: SearchReadsRequest.
            If `request.page_size` is set, up to this many records are
            returned. If not set, `_DEFAULT_PAGE_SIZE` is used as the page
            size.
            `request.start` can be overridden by providing a greater start
            position in `request.page_token`. If provided,
            `request.page_token` is parsed to an integer and compared with
            `request.start`. In that case, the greater of the two is used as
            the zero-based inclusive interval start. A token that is missing
            or not numeric is ignored.

    Returns:
        Tuple:
            1) List of converted alignments in GA4GH schema
            2) Maximum zero-based exclusive alignment end position over all
               alignments returned. This value can be set as
               request.page_token (after parsing to a string) for a
               subsequent request; in that case, streaming will pick up
               where it left off after this request.
    """
    # We are assuming the read group IDs are singleton
    run_accession = request.read_group_ids[0]
    reference_name = request.reference_id

    # Choose the start position between request.start and request.page_token.
    # int() parses arbitrarily large values on both Python 2 and 3, whereas
    # long() does not exist on Python 3. TypeError covers a None token,
    # ValueError an empty or non-numeric one.
    try:
        start = max(int(request.page_token), request.start)
    except (TypeError, ValueError):
        start = request.start
    end = request.end

    # Number of alignments to get
    if request.page_size < 1:
        num_aligns = _DEFAULT_PAGE_SIZE
    else:
        num_aligns = request.page_size

    alignments = []
    max_aligned_pos = 0  # Keep track of max zero-based exclusive alignment end

    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(run_accession) as run:
        # get requested reference
        with run.getReference(reference_name) as reference:
            # start iterator on requested range
            # We need to find out if it returns overlapping reads, or just
            # those that fit within the slice.
            with reference.getAlignmentSlice(start, end - start + 1, Alignment.primaryAlignment) as it:
                while it.nextAlignment():
                    # Only get the requested number of alignments
                    if len(alignments) == num_aligns:
                        break
                    max_aligned_pos = max(
                        max_aligned_pos,
                        it.getAlignmentPosition() + it.getAlignmentLength())
                    alignments.append(_convert_alignment(it))

    return (alignments, max_aligned_pos)
def test_ReadGroup_getStatistics(self):
    """The statistics of read group GS57510-FS3-L03 report the expected
    base and spot figures."""
    group = NGS.openReadCollection(WithGroups).getReadGroup("GS57510-FS3-L03")
    statistics = group.getStatistics()

    expected = (
        ("BASE_COUNT", 34164461870),
        ("BIO_BASE_COUNT", 34164461870),
        ("SPOT_COUNT", 488063741),
        ("SPOT_MAX", 5368875807),
        ("SPOT_MIN", 4880812067),
    )
    for key, value in expected:
        self.assertEqual(value, statistics.getAsU64(key))
def test_ReferenceWindow_Slice_Filtered_Start_Within_Slice(self):
    """Alignments from a startWithinSlice-filtered slice over the whole
    reference come back in non-decreasing position order."""
    collection = NGS.openReadCollection(WithCircularRef)
    reference = collection.getReference("NC_012920.1")
    alignments = reference.getFilteredAlignmentSlice(
        0, reference.getLength(), Alignment.all, Alignment.startWithinSlice, 0)

    # at least one alignment is required to seed the comparison
    self.assertTrue(alignments.nextAlignment())
    previous = alignments.getAlignmentPosition()

    while alignments.nextAlignment():
        current = alignments.getAlignmentPosition()
        message = (
            "Sorting violated. Last position (" + str(previous)
            + ") is higher than current one (" + str(current) + ")"
        )
        self.assertTrue(previous <= current, message)
        previous = current
def test_Alignment_isPaired_MultiFragmentsPerSpot(self):
    """Alignments 1, 2 and 6 of the PrimaryOnly run all report isPaired()."""
    collection = NGS.openReadCollection(PrimaryOnly)

    # the last one (.PA.6) has an unaligned mate
    for suffix in (".PA.1", ".PA.2", ".PA.6"):
        candidate = collection.getAlignment(PrimaryOnly + suffix)
        self.assertTrue(candidate.isPaired())
def run(acc, refName=None):
    """Run ``process`` on one named reference, or on every reference.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference to process. When falsy, all references of the
            run are processed in turn, each followed by an empty line.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as collection:
        if not refName:
            # no specific reference requested: walk all of them
            with collection.getReferences() as references:
                while references.nextReference():
                    process(references)
                    print("")
        else:
            with collection.getReference(refName) as reference:
                process(reference)
def start(self):
    """Open the read collection and cache its name, read count and the
    number of fragments found in its first read.
    """
    collection = NGS.openReadCollection(self.accn)
    self.read_collection = collection
    self.run_name = collection.getName()
    self.read_count = collection.getReadCount()

    # grab the first read and use it to determine whether the dataset
    # is single- or paired-end
    with collection.getReadRange(1, 1, Read.all) as first_read:
        first_read.nextRead()
        fragments = sra_reads(first_read)
        self.frag_count = len(fragments)
def run(acc):
    """Print one tab-separated line per reference of a run, then a total.

    Each line holds the common name, canonical name, length and whether the
    reference is circular or linear. The collection and the reference
    iterator are opened as context managers so the NGS objects are released
    when the function finishes.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # iterate over all references
        with run.getReferences() as it:
            i = 0
            while it.nextReference():
                print(it.getCommonName() + "\t" +
                      it.getCanonicalName() + "\t" +
                      str(it.getLength()) + "\t" +
                      ("circular" if it.getIsCircular() else "linear"))
                # count the reference so the summary reports the real total
                # (the counter was previously never advanced)
                i += 1

            print("Read {} references for {}".format(i, run_name))
def main():
    """Parse the command line and print FastQ records for an accession.

    ``--random`` switches from a contiguous window (``--start``/``--count``)
    to ``--count`` random rows; ``--split`` selects the split variant of the
    chosen reader. NGS errors are reported on stdout instead of propagating.
    """
    parser = argparse.ArgumentParser(description='produce FastQ using NGS')
    parser.add_argument('accession', default=None, type=str,
                        help='accession to process')
    parser.add_argument('-s', '--start', default=1, type=int,
                        help='first row to use')
    parser.add_argument('-n', '--count', default=10, type=int,
                        help='number of rows to use')
    parser.add_argument('-p', '--split', default=False, action='store_true',
                        help='split the READS')
    parser.add_argument('-r', '--random', default=False, action='store_true',
                        help='get n random rows')
    args = parser.parse_args()

    # identity comparison is the correct test for None
    if args.accession is None:
        print("accession missing!")
        return

    try:
        with NGS.openReadCollection(args.accession) as run:
            if args.random:
                if args.split:
                    src = random_fastq_split(run, args.count)
                else:
                    src = random_fastq(run, args.count)
            else:
                if args.split:
                    src = fastq_split(run, args.start, args.count)
                else:
                    src = fastq(run, args.start, args.count)

            # consume the reader while the collection is still open
            for read in src:
                printfastq(*read)
    except ErrorMsg as e:
        print("error: {}".format(e))
def run(acc):
    """Print one tab-separated line per reference of a run, then a total.

    Each line holds the common name, canonical name, length and whether the
    reference is circular or linear.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # iterate over all references
        with run.getReferences() as it:
            i = 0
            while it.nextReference():
                print("{}\t{}\t{}\t{}".format(
                    it.getCommonName(),
                    it.getCanonicalName(),
                    it.getLength(),
                    "circular" if it.getIsCircular() else "linear",
                ))
                # count the reference so the summary reports the real total
                # (the counter was previously never advanced)
                i += 1

            print("Read {} references for {}".format(i, run_name))
def run(acc):
    """List the references of a run and report how many were listed.

    One tab-separated line is printed per reference: common name, canonical
    name, length and ``circular`` or ``linear``.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # iterate over all references
        with run.getReferences() as it:
            i = 0
            while it.nextReference():
                print("{}\t{}\t{}\t{}".format(
                    it.getCommonName(),
                    it.getCanonicalName(),
                    it.getLength(),
                    "circular" if it.getIsCircular() else "linear",
                ))
                # without this increment the summary always reported 0
                i += 1

            print("Read {} references for {}".format(i, run_name))
def run(acc):
    """List the references of a run and report how many were listed.

    One tab-separated line is printed per reference: common name, canonical
    name, length and ``circular`` or ``linear``. The collection and the
    reference iterator are opened as context managers so the NGS objects
    are released when the function finishes.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # iterate over all references
        with run.getReferences() as it:
            i = 0
            while it.nextReference():
                print(
                    it.getCommonName() + "\t"
                    + it.getCanonicalName() + "\t"
                    + str(it.getLength()) + "\t"
                    + ("circular" if it.getIsCircular() else "linear")
                )
                # without this increment the summary always reported 0
                i += 1

            print("Read {} references for {}".format(i, run_name))
def test_ReferenceWindow(self):
    """Walking all alignments of the reference, the running count is 34 when
    the first secondary alignment is met and 61 at the second."""
    collection = NGS.openReadCollection(WithSecondary)
    reference = collection.getReference("gi|169794206|ref|NC_010410.1|")
    alignments = reference.getAlignments(Alignment.all)
    self.assertTrue(alignments.nextAlignment())

    # the first 2 secondary alignments' locations on the list: #34, #61
    seen = 1
    for expected in (34, 61):
        # advance until the next secondary alignment (not counted) or the end
        while alignments.nextAlignment():
            if alignments.getAlignmentCategory() == Alignment.secondaryAlignment:
                break
            seen += 1
        self.assertEqual(expected, seen)
def run(acc, refName, start, stop):
    """Print the primary alignments of a reference range, one per line.

    Each tab-separated line holds the read id, reference spec, alignment
    position, long CIGAR and aligned fragment bases.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference name passed to ``getReference``.
        start: First position handed to ``getAlignmentSlice``.
        stop: Last position of the range (inclusive).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as collection:
        name = collection.getName()

        # get requested reference
        with collection.getReference(refName) as reference:
            span = stop - start + 1

            # start iterator on requested range
            with reference.getAlignmentSlice(start, span, Alignment.primaryAlignment) as alignments:
                seen = 0
                while alignments.nextAlignment():
                    columns = (
                        alignments.getReadId(),
                        alignments.getReferenceSpec(),
                        alignments.getAlignmentPosition(),
                        alignments.getLongCigar(False),
                        alignments.getAlignedFragmentBases(),
                    )
                    print("{}\t{}\t{}\t{}\t{}".format(*columns))
                    seen += 1

                print("Read {} alignments for {}".format(seen, name))
def run(acc, refName, start, stop):
    """Print the primary alignments of a reference range, one per line.

    Each tab-separated line holds the read id, reference spec, alignment
    position, long CIGAR and aligned fragment bases. The collection,
    reference and iterator are opened as context managers so the NGS
    objects are released when the function finishes.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference name passed to ``getReference``.
        start: First position handed to ``getAlignmentSlice``.
        stop: Last position of the range (inclusive).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # get requested reference
        with run.getReference(refName) as ref:
            # start iterator on requested range
            with ref.getAlignmentSlice(start, stop - start + 1, Alignment.primaryAlignment) as it:
                i = 0
                while it.nextAlignment():
                    print("%s\t%s\t%d\t%s\t%s" % (
                        it.getReadId(),
                        it.getReferenceSpec(),
                        it.getAlignmentPosition(),
                        it.getLongCigar(False),
                        it.getAlignedFragmentBases(),
                    ))
                    i += 1

                print("Read %d alignments for %s" % (i, run_name))
def sra_reader(accn, batcher):
    """Iterates through a read collection for a given accession number using
    the ngs-lib python bindings.

    Args:
        accn: The accession number
        batcher: Callable that is given the run's read count and returns an
            iterable of ``(batch, start, size)`` tuples. ``start`` is
            incremented by one before being passed to ``getReadRange`` and
            ``size`` is the number of reads requested for that batch.

    Yields:
        The result of ``sra_read`` for each read of each batch, in order.
    """
    with NGS.openReadCollection(accn) as run:
        read_count = run.getReadCount()
        for _batch, start, size in batcher(read_count):
            with run.getReadRange(start + 1, size, Read.all) as read:
                for _ in range(size):
                    # Stop this batch if the iterator reports no further read
                    # instead of yielding from an exhausted iterator.
                    if not read.nextRead():
                        break
                    yield sra_read(read)
def run(acc, refName, start, stop):
    """Dump the primary alignments that fall in a reference range.

    One tab-separated line is printed per alignment: read id, reference
    spec, alignment position, long CIGAR and aligned fragment bases. All
    NGS objects are used as context managers so they are released on exit.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference name passed to ``getReference``.
        start: First position handed to ``getAlignmentSlice``.
        stop: Last position of the range (inclusive).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as run:
        run_name = run.getName()

        # get requested reference
        with run.getReference(refName) as ref:
            # start iterator on requested range
            with ref.getAlignmentSlice(start, stop - start + 1, Alignment.primaryAlignment) as it:
                i = 0
                while it.nextAlignment():
                    print("%s\t%s\t%d\t%s\t%s" % (
                        it.getReadId(),
                        it.getReferenceSpec(),
                        it.getAlignmentPosition(),
                        it.getLongCigar(False),
                        it.getAlignedFragmentBases(),
                    ))
                    i += 1

                print("Read %d alignments for %s" % (i, run_name))
def test_ReadCollection_getName(self):
    """The collection reports the accession it was opened with as its name."""
    collection = NGS.openReadCollection(PrimaryOnly)
    reported = collection.getName()
    self.assertEqual(PrimaryOnly, reported)
def test_ReadGroupIterator_Next(self):
    """After a successful nextReadGroup(), getName() can be called."""
    collection = NGS.openReadCollection(PrimaryOnly)
    groups = collection.getReadGroups()
    self.assertTrue(groups.nextReadGroup())
    # only checks that the call does not raise; the value is not inspected
    groups.getName()
def test_ReadCollection_getReadGroups(self):
    """Smoke test: obtaining the read-group iterator must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getReadGroups()
def test_ReadCollection_getReadGroup(self):
    """Smoke test: looking up read group "C1ELY.6" must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getReadGroup("C1ELY.6")
def test_ReadCollection_getReferences(self):
    """Smoke test: obtaining the reference iterator must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getReferences()
def run(acc, refName, start, stop):
    """Print one pileup line per position of a reference range.

    Each line holds the reference spec, the position (printed one higher
    than the iterator reports), the reference base, the pileup depth, an
    encoded event string and the concatenated alignment qualities.

    Args:
        acc: Accession passed to ``NGS.openReadCollection``.
        refName: Reference name passed to ``getReference``.
        start: First position of the range; ``start - 1`` is handed to
            ``getPileupSlice``.
        stop: Last position of the range (inclusive).
    """
    # open requested accession using SRA implementation of the API
    with NGS.openReadCollection(acc) as read_collection:
        accession_name = read_collection.getName()

        # get requested reference
        with read_collection.getReference(refName) as target:
            width = stop - start + 1

            # start iterator on requested range
            with target.getPileupSlice(start - 1, width) as cursor:
                pileups = 0
                while cursor.nextPileup():
                    quality_text = ""
                    event_text = ""
                    header = "{}\t{}\t{}\t{}".format(
                        cursor.getReferenceSpec(),
                        cursor.getReferencePosition() + 1,
                        cursor.getReferenceBase(),
                        cursor.getPileupDepth(),
                    )

                    while cursor.nextPileupEvent():
                        event = cursor.getEventType()
                        on_minus = (event & PileupEvent.alignment_minus_strand) != 0

                        # alignment start marker followed by mapping quality
                        if (event & PileupEvent.alignment_start) != 0:
                            event_text += '^' + chr(cursor.getMappingQuality() + 33)

                        # insertion: marker, length, bases (lower case on minus strand)
                        if (event & PileupEvent.insertion) != 0:
                            event_text += '+'
                            insertion = cursor.getInsertionBases()
                            event_text += str(len(insertion))
                            if on_minus:
                                event_text += insertion.lower()
                            else:
                                event_text += insertion

                        # the low three bits select deletion / match / mismatch
                        basic = event & 7
                        if basic == PileupEvent.deletion:
                            event_text += '<' if on_minus else '>'
                        elif basic == PileupEvent.match:
                            event_text += ',' if on_minus else '.'
                        elif basic == PileupEvent.mismatch:
                            observed = str(cursor.getAlignmentBase())
                            if on_minus:
                                event_text += observed.lower()
                            else:
                                event_text += observed.upper()

                        if (event & PileupEvent.alignment_stop) != 0:
                            event_text += '$'

                        quality_text += cursor.getAlignmentQuality()

                    pileups += 1
                    print("{}\t{}\t{}".format(header, event_text, quality_text))

                print("Read {} pileups for {}".format(pileups, accession_name))
def test_ReadCollection_getAlignmentCount_WithSecondary_Secondary(self):
    """The WithSecondary run reports 10 secondary alignments."""
    collection = NGS.openReadCollection(WithSecondary)
    secondary_total = collection.getAlignmentCount(Alignment.secondaryAlignment)
    self.assertEqual(10, secondary_total)
def test_ReadCollection_getAlignmentCount_WithSecondary_All(self):
    """The WithSecondary run reports 178 alignments across all categories."""
    collection = NGS.openReadCollection(WithSecondary)
    overall_total = collection.getAlignmentCount(Alignment.all)
    self.assertEqual(178, overall_total)
def test_ReadCollection_getAlignmentCount_PrimaryOnly_Secondary(self):
    """The PrimaryOnly run reports no secondary alignments."""
    collection = NGS.openReadCollection(PrimaryOnly)
    secondary_total = collection.getAlignmentCount(Alignment.secondaryAlignment)
    self.assertEqual(0, secondary_total)
def test_ReadCollection_getAlignmentCount_PrimaryOnly_All(self):
    """The PrimaryOnly run reports 3987701 alignments across all categories."""
    collection = NGS.openReadCollection(PrimaryOnly)
    overall_total = collection.getAlignmentCount(Alignment.all)
    self.assertEqual(3987701, overall_total)
def test_ReadCollection_getAlignments_all(self):
    """Smoke test: requesting an iterator over all alignments must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getAlignments(Alignment.all)
def test_ReadCollection_getAlignments_Secondary(self):
    """Smoke test: requesting an iterator over secondary alignments must
    not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getAlignments(Alignment.secondaryAlignment)
def test_ReadCollection_getAlignment(self):
    """Smoke test: fetching alignment ".PA.1" by id must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    alignment_id = PrimaryOnly + ".PA.1"
    collection.getAlignment(alignment_id)
def test_ReadCollection_hasReference(self):
    """hasReference() is true for "supercont2.1" and false for a name that
    does not exist in the PrimaryOnly run."""
    # TestCase assertions are used instead of bare ``assert`` statements so
    # the checks still run under ``python -O`` and match the sibling tests.
    self.assertTrue(NGS.openReadCollection(PrimaryOnly).hasReference("supercont2.1"))
    self.assertFalse(NGS.openReadCollection(PrimaryOnly).hasReference("non-existent acc"))
def test_ReadCollection_getReference(self):
    """Smoke test: looking up reference "supercont2.1" must not raise."""
    collection = NGS.openReadCollection(PrimaryOnly)
    collection.getReference("supercont2.1")