def main():
    parser = argparse.ArgumentParser(
        description='batch Genbank records by publication year and reference key',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # directory to store the batches in
    parser.add_argument('batch_dirname', metavar='dir',
                        help='name for the batch directory')
    # input files
    parser.add_argument('genbank_filename', metavar='genbank-file',
                        help='the file with the Genbank records')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    # check if the batch directory already exists
    if os.path.exists(args.batch_dirname):
        logging.error('batch directory %s already exists', args.batch_dirname)
        return 10
    else:
        os.mkdir(args.batch_dirname)

    data = {}
    record_counts = 0

    # read in the Genbank records
    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        for record in SeqIO.parse(genbank_handle, 'genbank'):
            record_counts += 1
            # store the data by year and key
            key, year = calculate_key_year(record)
            if year not in data:
                data[year] = {}
            if key not in data[year]:
                data[year][key] = []
            data[year][key].append(record)

    # make a directory for each year
    for year in data:
        os.mkdir('%s/%04d' % (args.batch_dirname, year))
        # for each key, make a file with the Genbank records
        for key in data[year]:
            if type(key) is int:
                filename = '%s/%04d/pmid_%d.genbank' % (args.batch_dirname, year, key)
            else:
                filename = '%s/%04d/hash_%d.genbank' % (args.batch_dirname, year, abs(hash(key)))
            with open(filename, 'wt') as handle:
                SeqIO.write(data[year][key], handle, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
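# Two helpers used above are defined elsewhere in this codebase; what follows
# are minimal sketches, not the actual implementations.
#
# open_compressed() appears throughout these scripts. The sketch below assumes
# it dispatches on the filename extension and treats '-' as stdin/stdout:
import bz2
import gzip
import sys

def open_compressed_sketch(filename, mode='rt'):
    """Open a possibly gzip- or bzip2-compressed file (hypothetical helper)."""
    if filename == '-':
        if 'r' in mode:
            return sys.stdin.buffer if 'b' in mode else sys.stdin
        return sys.stdout.buffer if 'b' in mode else sys.stdout
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if filename.endswith('.bz2'):
        return bz2.open(filename, mode)
    return open(filename, mode)

# calculate_key_year() is also assumed: the sketch below guesses that the key
# is the PubMed ID of the first reference when available (int keys become the
# pmid_* files above) and a reference title otherwise (hashed into hash_*
# files), with the year taken from the record's date annotation:
def calculate_key_year_sketch(record):
    """Return a (key, year) grouping pair for a GenBank record (hypothetical)."""
    year = int(record.annotations.get('date', '01-JAN-0000')[-4:])
    references = record.annotations.get('references', [])
    if references and references[0].pubmed_id:
        return int(references[0].pubmed_id), year
    if references:
        return references[0].title, year
    return record.id, year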
def deduce_schema(filename, examine_records=1000):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)
        # read a batch of records, of size examine_records, to figure out
        # which columns to make
        parses_ids = set()
        lineages_ids = set()
        for record in itertools.islice(reader, examine_records):
            if 'parses' in record:
                parses_ids.update(record['parses'].keys())
            if 'lineages' in record:
                lineages_ids.update(record['lineages'].keys())

    schema = pa.schema([
        pa.field('subject', pa.string(), nullable=False),
        pa.field('sample', pa.string(), nullable=True),
        pa.field('source', pa.string(), nullable=False),
        pa.field('name', pa.string(), nullable=False),
        pa.field('sequence', pa.binary(), nullable=False)
    ] + [
        pa.field('parse_' + p, pa.binary(), nullable=True)
        for p in sorted(parses_ids)
    ] + [
        pa.field('lineage_' + p, pa.string(), nullable=True)
        for p in sorted(lineages_ids)
    ])

    return schema, parses_ids, lineages_ids
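# A usage sketch for deduce_schema(); 'records.avro' is a hypothetical input
# produced by the rest of this pipeline:
#   schema, parses_ids, lineages_ids = deduce_schema('records.avro')
#   print(schema)            # the fixed columns plus parse_*/lineage_* fields
#   print(sorted(parses_ids), sorted(lineages_ids))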
def main():
    parser = argparse.ArgumentParser(
        description='add the given subject to an Avro file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename', metavar='seq_record.avro',
                        help='the Avro file with the sequence records')
    parser.add_argument('subject', metavar='S', help='the subject to set')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    with open_compressed(args.seq_record_filename, 'rb') as seq_record_handle:
        seq_record_reader = fastavro.reader(seq_record_handle)
        fastavro.writer(sys.stdout.buffer,
                        seq_record_reader.writer_schema,
                        subject_adder(seq_record_reader, args.subject),
                        codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
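# subject_adder() is defined elsewhere; a minimal sketch, assuming it simply
# overwrites the subject field on each record as it streams through to the
# Avro writer:
def subject_adder_sketch(records, subject):
    for record in records:
        record['subject'] = subject
        yield record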
def main():
    parser = argparse.ArgumentParser(
        description='subset Genbank files based on organism and length',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # files
    parser.add_argument('genbank_filenames', metavar='genbank_file', nargs='+',
                        help='the file with the Genbank records')
    parser.add_argument('--organism', '-o', metavar='O', type=str,
                        default='Homo sapiens',
                        help='only process records with the given organism')
    parser.add_argument('--max-size', '-m', metavar='M', type=int,
                        default=50000,
                        help='ignore sequences longer than this')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    for genbank_filename in args.genbank_filenames:
        logging.info('processing %s', genbank_filename)
        with open_compressed(genbank_filename, 'rt') as genbank_file:
            records = SeqIO.parse(genbank_file, 'genbank')
            filtered_records = genbank_filter(records,
                                              organism=args.organism,
                                              max_size=args.max_size)
            SeqIO.write(filtered_records, sys.stdout, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s',
                 time.strftime('%H hours, %M minutes, %S seconds',
                               time.gmtime(elapsed_time)))
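# genbank_filter() is imported from elsewhere; a minimal sketch, consistent
# with the genbank_filter_chain() function later in this file, assuming it
# filters on the exact organism annotation and a maximum sequence length:
def genbank_filter_sketch(records, organism=None, max_size=None):
    for record in records:
        if organism is not None and record.annotations.get('organism') != organism:
            continue
        if max_size is not None and len(record.seq) > max_size:
            continue
        yield record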
def main():
    parser = argparse.ArgumentParser(
        description='convert sequence records in Avro format to FASTA or FASTQ')
    # input files
    parser.add_argument('seq_record_filenames', metavar='seq_record.avro',
                        nargs='+',
                        help='the Avro file with the sequence records')
    # options
    output_format = parser.add_mutually_exclusive_group()
    output_format.add_argument('--fasta', '-a', default=True,
                               action='store_true',
                               help='output a FASTA file')
    output_format.add_argument('--fastq', '-q', action='store_false',
                               dest='fasta',
                               help='output a FASTQ file')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    for input_filename in args.seq_record_filenames:
        with open_compressed(input_filename, 'rb') as seq_record_handle:
            seq_record_reader = fastavro.reader(seq_record_handle)
            for record in seq_record_reader:
                if args.fasta:
                    print('>%s\n%s' % (record['name'],
                                       record['sequence']['sequence']))
                else:
                    print('@%s\n%s\n+\n%s' % (record['name'],
                                              record['sequence']['sequence'],
                                              record['sequence']['qual']))

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s',
                 time.strftime('%H hours, %M minutes, %S seconds',
                               time.gmtime(elapsed_time)))
def main():
    parser = argparse.ArgumentParser(
        description='print the accession and best V(D)J segment names for each record',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('parse_label', metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('filenames', metavar='file', nargs='+',
                        help='the Avro file to read')

    args = parser.parse_args()

    print('accession', 'description', 'v_name', 'd_name', 'j_name', sep='\t')

    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)
            for record in reader:
                name = record['name']
                assert name.startswith('genbank:')
                accession = name.split(':')[1]

                parse = record['parses'][args.parse_label]
                v_name, _, d_name, _, j_name, _ = best_vdj_score(parse)

                description = None
                if 'description' in record['sequence']['annotations']:
                    description = record['sequence']['annotations']['description']

                print(accession, description, v_name, d_name, j_name, sep='\t')
def igblast_chain(igblast_filenames):
    for filename in igblast_filenames:
        with open_compressed(filename, 'rt') as igblast_handle:
            logging.info('processing parses in %s', filename)
            igblast_parse_reader = IgBLASTParser(igblast_handle)
            for record in igblast_parse_reader:
                yield record
def main():
    parser = argparse.ArgumentParser(
        description='extract the read names and alignment scores from a SAM file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('sam_filenames', metavar='file.sam', nargs='*',
                        default=['-'],
                        help='the SAM files to process')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    record_template = {'pair_id': None, 'align_score': None}
    writer = csv.DictWriter(sys.stdout, fieldnames=record_template.keys())
    writer.writeheader()

    for sam_filename in args.sam_filenames:
        with open_compressed(sam_filename, 'rt') as sam_file_handle:
            for ident, score in basic_sam_parser_match(sam_file_handle):
                record = record_template.copy()
                record['pair_id'] = ident
                record['align_score'] = score
                writer.writerow(record)

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
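# basic_sam_parser_match() is defined elsewhere; a minimal sketch, assuming it
# yields (query name, alignment score) pairs, with the score taken from the
# optional AS:i: tag of each SAM record:
def basic_sam_parser_match_sketch(handle):
    for line in handle:
        if line.startswith('@'):  # skip header lines
            continue
        fields = line.rstrip('\n').split('\t')
        score = None
        for tag in fields[11:]:   # optional tags start at column 12
            if tag.startswith('AS:i:'):
                score = int(tag[5:])
                break
        yield fields[0], score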
def main():
    parser = argparse.ArgumentParser(
        description='get the counts of subjects and sources from sequence record objects',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename', metavar='seq_record.avro',
                        nargs='*',
                        help='the Avro file with the sequence records')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    subject_source_counts = Counter()

    if len(args.seq_record_filename) == 0:
        seq_record_filenames = ['-']
    else:
        seq_record_filenames = args.seq_record_filename

    for record_filename in seq_record_filenames:
        logging.info('processing file %s', record_filename)
        with open_compressed(record_filename, 'rb') as seq_record_handle:
            for record in fastavro.reader(seq_record_handle):
                subject = record['subject']
                source = record['source']
                subject_source_counts[(subject, source)] += 1

    for subject, source in subject_source_counts:
        print(subject, source, subject_source_counts[(subject, source)], sep='\t')
def main():
    parser = argparse.ArgumentParser(
        description='report barcode/target combinations that are missing from the barcode map',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # what to put into the source field
    parser.add_argument('source', metavar='source',
                        help='what to put into the source field')
    # the barcode map
    parser.add_argument('barcode_map_filename', metavar='barcode_map.csv',
                        help='CSV file with the barcode map')
    # files with the barcodes and targets
    parser.add_argument('barcodes_targets_filenames', metavar='idents.csv',
                        nargs='+',
                        help='CSV file with the barcodes and targets')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logging.info('loading barcode map')
    barcode_map = {}
    with open(args.barcode_map_filename, 'r') as map_handle:
        for row in csv.DictReader(map_handle):
            assert args.source == row['run_label']
            key = (row['barcode1'], row['target1'],
                   row['barcode2'], row['target2'])
            assert key not in barcode_map  # make sure there are no duplicate rows
            barcode_map[key] = (row['participant_label'], row['replicate_label'])
    logging.info('loaded %d entries', len(barcode_map))

    for barcodes_targets_filename in args.barcodes_targets_filenames:
        logging.info('processing file %s', barcodes_targets_filename)
        with open_compressed(barcodes_targets_filename, 'rt') as barcode_targets_handle:
            for record in csv.DictReader(barcode_targets_handle):
                key = (record['barcode1:name'], record['target1:name'],
                       record['barcode2:name'], record['target2:name'])
                if key not in barcode_map:
                    # if not found, print the unmatched combination
                    print(*key, sep=',')
def main():
    parser = argparse.ArgumentParser(
        description='validate sequence records from Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('repertoire_filenames', metavar='repertoire-file',
                        nargs=3,
                        help='the V(D)J repertoire file used in IgBLAST')
    parser.add_argument('sequence_record_filenames', metavar='seq_record.avro',
                        nargs='+',
                        help='Avro files with the sequence records to test')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    v_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[0], 'fasta')
    }
    d_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[1], 'fasta')
    }
    j_repertoire = {
        r.id: str(r.seq)
        for r in SeqIO.parse(args.repertoire_filenames[2], 'fasta')
    }

    record_count = 0
    error = False
    for filename in args.sequence_record_filenames:
        logging.info('processing file %s', filename)
        if error:
            break
        with open_compressed(filename, 'rb') as input_handle:
            for record in fastavro.reader(input_handle):
                if not tests.test_parse_alignment_structure(record):
                    pprint(record)
                    error = True
                if not tests.test_parse_alignment_sequences(
                        record, v_repertoire, d_repertoire, j_repertoire):
                    pprint(record)
                    error = True
                if error:
                    break
                record_count += 1

    logging.info('processed %d sequence records', record_count)

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
def avro_2nd_field_missable_iterator(filename, fieldname1, fieldname2):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)
        for record in reader:
            if fieldname2 in record[fieldname1]:
                yield record[fieldname1][fieldname2]
            else:
                yield None
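# A usage sketch for the iterator above, with hypothetical field names,
# yielding each record's optional quality string or None when it is absent:
#   for qual in avro_2nd_field_missable_iterator('records.avro', 'sequence', 'qual'):
#       ...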
def main():
    parser = argparse.ArgumentParser(
        description='generate barcode and primer information for FASTQ read pairs',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # what to put into the source field
    parser.add_argument('source', metavar='source',
                        help='what to put into the source field')
    # the barcode map
    parser.add_argument('barcode_map_filename', metavar='barcode_map.csv',
                        help='CSV file with the barcode map')
    # file with the barcodes and targets
    parser.add_argument('barcodes_targets_filename', metavar='idents.csv',
                        help='CSV file with the barcodes and targets')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    logging.info('loading barcode map')
    barcode_map = {}
    with open(args.barcode_map_filename, 'r') as map_handle:
        for row in csv.DictReader(map_handle):
            assert args.source == row['run_label']
            key = (row['barcode1'], row['target1'],
                   row['barcode2'], row['target2'])
            assert key not in barcode_map  # make sure there are no duplicate rows
            barcode_map[key] = (row['participant_label'], row['replicate_label'])
    logging.info('loaded %d entries', len(barcode_map))

    demuxing_template = {'pair_id': None, 'source': args.source,
                         'subject': None, 'sample': None}
    writer = csv.DictWriter(sys.stdout, fieldnames=demuxing_template.keys())
    writer.writeheader()

    logging.info('annotating reads')
    read_pair_count = 0
    annotated_pair_count = 0
    with open_compressed(args.barcodes_targets_filename, 'rt') as barcode_targets_handle:
        for record in csv.DictReader(barcode_targets_handle):
            read_pair_count += 1

            demuxing = demuxing_template.copy()
            demuxing['pair_id'] = record['pair_id']

            # form the lookup key
            key = (record['barcode1:name'], record['target1:name'],
                   record['barcode2:name'], record['target2:name'])
            if key in barcode_map:
                # if found, annotate with subject and sample
                annotated_pair_count += 1
                participant, sample = barcode_map[key]
                demuxing['subject'] = participant
                demuxing['sample'] = sample

            writer.writerow(demuxing)

    logging.info('processed %d read pairs', read_pair_count)
    logging.info('annotated %d (%f%%) read pairs', annotated_pair_count,
                 100.0 * annotated_pair_count / read_pair_count)

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s',
                 time.strftime('%H hours, %M minutes, %S seconds',
                               time.gmtime(elapsed_time)))
def main():
    parser = argparse.ArgumentParser(
        description='concatenate two FASTQ files, reverse complementing the '
                    'second one, with a given spacer in between',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('fastq1_filename', metavar='file1.fq',
                        help='the R1 FASTQ file')
    parser.add_argument('fastq2_filename', metavar='file2.fq',
                        help='the R2 FASTQ file to reverse complement and concatenate to the above')
    # options
    parser.add_argument('--spacer', '-s', type=str, default='XXXXXXXX',
                        help='the spacer sequence')
    parser.add_argument('--qual-spacer', '-q', type=int, default=0,
                        help='the qual score to assign to the spacer')

    args = parser.parse_args()

    spacer_seq = args.spacer
    spacer_qual = chr(33 + args.qual_spacer) * len(spacer_seq)

    # read the FASTQ files in lockstep
    with open_compressed(args.fastq1_filename, 'rt') as input1_handle, \
            open_compressed(args.fastq2_filename, 'rt') as input2_handle:
        for r1, r2 in zip(FastqGeneralIterator(input1_handle),
                          FastqGeneralIterator(input2_handle)):
            r1_id, r1_seq, r1_qual = r1
            r2_id, r2_seq, r2_qual = r2

            r1_seq += spacer_seq + r2_seq.translate(_dna_complement_table)[::-1]
            r1_qual += spacer_qual + r2_qual[::-1]

            print('@%s\n%s\n+\n%s' % (r1_id, r1_seq, r1_qual))
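# _dna_complement_table is defined elsewhere; a minimal sketch, assuming a
# str.maketrans() table over the DNA alphabet (plus N) in both cases, so that
# translate() followed by [::-1] gives the reverse complement:
_dna_complement_table_sketch = str.maketrans('ACGTNacgtn', 'TGCANtgcan')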
def main():
    parser = argparse.ArgumentParser(
        description='get the list of subjects in a set of sequence records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filename', metavar='seq_record.avro',
                        nargs='*',
                        help='the Avro file with the sequence records')
    # options
    parser.add_argument('-s', '--sort-counts', action='store_true',
                        help='sort by per-subject counts')
    parser.add_argument('-c', '--show-counts', action='store_true',
                        help='show the per-subject counts')
    parser.add_argument('-n', '--show-none', action='store_true',
                        help='show the un-assigned records')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    subject_counts = Counter()

    if len(args.seq_record_filename) == 0:
        seq_record_filenames = ['-']
    else:
        seq_record_filenames = args.seq_record_filename

    for record_filename in seq_record_filenames:
        logging.info('processing file %s', record_filename)
        with open_compressed(record_filename, 'rb') as seq_record_handle:
            subject_counts.update(
                get_subjects(fastavro.reader(seq_record_handle)))

    if args.sort_counts:
        for s, c in subject_counts.most_common():
            if args.show_none or s is not None:
                if args.show_counts:
                    print(s, c, sep='\t')
                else:
                    print(s)
    else:
        for s in sorted(subject_counts.keys(), key=str):
            if args.show_none or s is not None:
                if args.show_counts:
                    c = subject_counts[s]
                    print(s, c, sep='\t')
                else:
                    print(s)
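# get_subjects() is defined elsewhere; a minimal sketch, assuming it extracts
# the subject field of each record so the Counter above can tally them:
def get_subjects_sketch(records):
    for record in records:
        yield record['subject']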
def main():
    parser = argparse.ArgumentParser(
        description='generate a sheet for Genbank immune receptor annotation',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input file
    parser.add_argument('genbank_filename', metavar='genbank-file',
                        help='the file with the Genbank records')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    # derive the Excel filename from the Genbank filename
    excel_filename = args.genbank_filename
    if '.' in excel_filename:
        excel_filename = excel_filename[:excel_filename.rindex('.')]
    excel_filename += '.xlsx'

    # read in the Genbank records
    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        # load all the records
        records = list(SeqIO.parse(genbank_handle, 'genbank'))

    # get a unique references list for all records
    references = get_master_references(records)

    # create the workbook
    workbook = xlsxwriter.Workbook(excel_filename)
    curation_worksheet = workbook.add_worksheet('Curation')
    records_worksheet = workbook.add_worksheet('Records')

    # write the references to the sheet
    current_row = write_references(workbook, curation_worksheet, references)
    current_row += 1
    # write curation annotation
    current_row = write_curation_row(workbook, curation_worksheet, records,
                                     current_row)
    # write_genbank_records(workbook, records_worksheet, records)

    workbook.close()

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
def main():
    parser = argparse.ArgumentParser(
        description='extract sequence records from Avro files with a given subject',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('source_record_filename', metavar='seq_record.avro',
                        nargs='+',
                        help='Avro files with the sequence records to extract')
    parser.add_argument('dest_record_filename', metavar='target.avro',
                        help='the destination Avro file')
    parser.add_argument('subject_label', metavar='subject',
                        help='the subject to extract, use none for un-assigned records')
    # append
    parser.add_argument('-a', '--append', action='store_true',
                        help='append records to an existing Avro file')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    if os.path.exists(args.dest_record_filename):
        if args.append:
            logging.info('appending to existing sequence record file %s',
                         args.dest_record_filename)
        else:
            logging.error('destination file already exists, use the --append/-a flag to add to it, exiting')
            return 10

    if args.subject_label == 'none':
        args.subject_label = None
    logging.info('extracting records for subject %s', args.subject_label)

    # read in the first file to get the schema
    with open_compressed(args.source_record_filename[0], 'rb') as seq_record_handle:
        reader = fastavro.reader(seq_record_handle)
        schema = reader.writer_schema

    # open and append to the destination file
    with open_compressed(args.dest_record_filename, 'a+b') as dest_record_handle:
        fastavro.writer(dest_record_handle, schema,
                        avro_file_record_filter_iter(args.source_record_filename,
                                                     args.subject_label),
                        codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s',
                 time.strftime('%H hours, %M minutes, %S seconds',
                               time.gmtime(elapsed_time)))
def parse_chain(filenames, file_format, mode='rt'):
    for filename in filenames:
        with open_compressed(filename, mode) as handle:
            if file_format in ['avro', 'seq_rec', 'seq_record', 'sequence_record']:
                seq_record_reader = fastavro.reader(handle)
                for record in seq_record_reader:
                    result = SeqRecord(Seq(record['sequence']['sequence']),
                                       id=record['name'],
                                       description='')
                    # set the quality scores directly on the restricted
                    # per-letter annotations dict, bypassing its validation
                    dict.__setitem__(result._per_letter_annotations,
                                     'phred_quality',
                                     record['sequence']['qual'])
                    yield result
            else:
                for record in SeqIO.parse(handle, file_format):
                    yield record
def genbank_filter_chain(filenames, organism=None, max_length=None):
    processed_record_count = 0
    for genbank_filename in filenames:
        logging.info('processing %s', genbank_filename)
        with open_compressed(genbank_filename, 'rt') as genbank_file:
            for record in SeqIO.parse(genbank_file, 'genbank'):
                if (organism is None) or (organism == record.annotations['organism']):
                    sequence_length = len(record.seq)
                    if (max_length is None) or (sequence_length <= max_length):
                        processed_record_count += 1
                        yield seq_record_from_genbank(record)
                    else:
                        logging.info('record %s (%s) is too big, %d > %s, ignoring',
                                     record.id, record.description,
                                     sequence_length, max_length)
    logging.info('processed %d records', processed_record_count)
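# seq_record_from_genbank() is defined elsewhere; a minimal sketch, assuming
# the sequence-record dict layout used throughout these scripts (a 'genbank:'
# prefixed name and a description annotation, as read back elsewhere):
def seq_record_from_genbank_sketch(record):
    return {
        'subject': None,
        'sample': None,
        'source': 'genbank',
        'name': 'genbank:%s' % record.id,
        'sequence': {
            'sequence': str(record.seq),
            'annotations': {'description': record.description},
        },
    }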
def main():
    parser = argparse.ArgumentParser(
        description='get the clone counts in the given Avro files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('lineage_label', metavar='label',
                        help='the clone label to use')
    parser.add_argument('filenames', metavar='file', nargs='+',
                        help='the Avro files to read')

    args = parser.parse_args()

    clones_counts = defaultdict(int)
    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)
            for record in reader:
                if args.lineage_label in record['lineages']:
                    subject = record['subject']
                    source = record['source']
                    type_ = record['sequence']['annotations']['target1']
                    lineage = record['lineages'][args.lineage_label]
                    clones_counts[(subject, source, type_, lineage)] += 1

    writer = csv.DictWriter(
        sys.stdout,
        fieldnames=['subject', 'source', 'type', 'lineage', 'read_count'])
    writer.writeheader()
    for (subject, source, type_, lineage), read_count in clones_counts.items():
        row = {
            'subject': subject,
            'source': source,
            'type': type_,
            'lineage': lineage,
            'read_count': read_count
        }
        writer.writerow(row)
def main():
    parser = argparse.ArgumentParser(
        description='extract immune receptor sequences from Genbank records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('genbank_filename', metavar='genbank-file',
                        help='the file with the Genbank records')
    parser.add_argument('igblast_output_filenames', metavar='parse.igblast',
                        nargs='+',
                        help='the output of IgBLAST to get the scores from')
    # options
    parser.add_argument('--min-v-score', metavar='S', type=float, default=70.0,
                        help='the minimum score for the V-segment')
    parser.add_argument('--min-j-score', metavar='S', type=float, default=26.0,
                        help='the minimum score for the J-segment')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    min_v_score = args.min_v_score
    min_j_score = args.min_j_score

    with open_compressed(args.genbank_filename, 'rt') as genbank_handle:
        for genbank_record, igblast_record in zip(
                SeqIO.parse(genbank_handle, 'genbank'),
                igblast_parse_chain(args.igblast_output_filenames)):
            assert genbank_record.id == igblast_record.query_name.split(' ')[0]
            if igblast_record:
                # get the best scores
                best_scores = defaultdict(float)
                for align_line in igblast_record.alignment_lines[1:]:
                    segment_type = align_line.segment_type
                    align_score = igblast_record.significant_alignments[align_line.name]
                    # save the best score for each segment type
                    best_scores[segment_type] = max(best_scores[segment_type],
                                                    align_score.bit_score)

                if best_scores['V'] >= min_v_score and best_scores['J'] >= min_j_score:
                    SeqIO.write(genbank_record, sys.stdout, 'genbank')

    elapsed_time = time.time() - start_time
    logging.info('elapsed time %s',
                 time.strftime('%H hours, %M minutes, %S seconds',
                               time.gmtime(elapsed_time)))
def main():
    parser = argparse.ArgumentParser(
        description='convert a FASTQ file into a FASTA file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input file
    parser.add_argument('fastq_filename', metavar='file.fq', default='-',
                        help='FASTQ file to convert')
    # options
    parser.add_argument('-t', '--trim-label', action='store_true',
                        help='trim the read label at the first space')

    args = parser.parse_args()

    # read the FASTQ file
    with open_compressed(args.fastq_filename, 'rt') as input_handle:
        for read_id, read_seq, read_qual in FastqGeneralIterator(input_handle):
            if args.trim_label:
                read_id = read_id.split(' ')[0]
            print('>%s\n%s' % (read_id, read_seq))
def main():
    parser = argparse.ArgumentParser(
        description='get the per-type read counts from an Avro file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('filename', metavar='file',
                        help='the Avro file to read')

    args = parser.parse_args()

    subject = None
    read_counts_type = defaultdict(int)
    with open_compressed(args.filename, 'rb') as read_handle:
        reader = fastavro.reader(read_handle)
        for record in reader:
            if subject is None:
                subject = record['subject']
            else:
                assert subject == record['subject']
            type_ = record['sequence']['annotations']['target1']
            read_counts_type[type_] += 1

    for type_, count in read_counts_type.items():
        print(subject, type_, count, sep='\t')
def main():
    parser = argparse.ArgumentParser(
        description='batch paired-end sequences from an Illumina run of an amplicon library',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # directory to store the batches in
    parser.add_argument('batch_dirname', metavar='dir',
                        help='name for the batch directory')
    # input files
    parser.add_argument('r1_filename', metavar='r1_file',
                        help='the file with the read 1 sequences')
    parser.add_argument('r2_filename', metavar='r2_file',
                        help='the file with the read 2 sequences')
    # parameters
    parser.add_argument('--batch-size', '-b', metavar='B', type=int,
                        default=50000,
                        help='the number of read pairs to insert at a time')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    # check if the batch directory already exists
    if os.path.exists(args.batch_dirname):
        logging.error('batch directory %s already exists', args.batch_dirname)
        return 10
    else:
        os.mkdir(args.batch_dirname)

    read_count = 0
    batch_count = 0

    # read the FASTQ files
    with open_compressed(args.r1_filename, 'rt') as in_read1_handle, \
            open_compressed(args.r2_filename, 'rt') as in_read2_handle:
        # iterate over the read files
        for r1_batch, r2_batch in zip(
                batches(FastqGeneralIterator(in_read1_handle), args.batch_size),
                batches(FastqGeneralIterator(in_read2_handle), args.batch_size)):
            # filename prefix for the batches
            batch_prefix = os.path.join(args.batch_dirname,
                                        'batch%06d' % batch_count)
            logging.info('creating batch %06d', batch_count)
            # compressed batch output files
            with gzip.open(batch_prefix + '.fq1.gz', 'wt') as out_read1_handle, \
                    gzip.open(batch_prefix + '.fq2.gz', 'wt') as out_read2_handle:
                # for each read pair in the batch
                for r1_read, r2_read in zip(r1_batch, r2_batch):
                    r1_id, r1_seq, r1_qual = r1_read
                    r2_id, r2_seq, r2_qual = r2_read
                    # check that the read ids are the same
                    assert r1_id.split(' ')[0] == r2_id.split(' ')[0], \
                        'read ids do not match %s != %s' % (r1_id, r2_id)
                    out_read1_handle.write('@%s\n%s\n+\n%s\n' % (r1_id, r1_seq, r1_qual))
                    out_read2_handle.write('@%s\n%s\n+\n%s\n' % (r2_id, r2_seq, r2_qual))
                    read_count += 1
            batch_count += 1

    logging.info('processed %d read pairs', read_count)
    logging.info('created %d batches', batch_count)

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
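# batches() is defined elsewhere; a minimal sketch, assuming it chunks an
# iterator into lists of at most batch_size items:
def batches_sketch(iterator, batch_size):
    batch = []
    for item in iterator:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch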
def avro_file_record_filter_iter(filenames, subject):
    for filename in filenames:
        logging.info('processing file %s', filename)
        with open_compressed(filename, 'rb') as file_handle:
            for record in fastavro.reader(file_handle):
                if record['subject'] == subject:
                    yield record
def main():
    parser = argparse.ArgumentParser(
        description='generate some basic stats from a set of sequence records',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('seq_record_filenames', metavar='seq_record.avro',
                        nargs='+',
                        help='Avro files with the sequence records')
    # options
    parser.add_argument('--parse-label', '-p', metavar='L',
                        help='collect stats on the given parse label')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    record_count = 0
    no_subject = 0
    no_subject_full_ident = 0
    no_subject_phix = 0
    no_subject_parsed = 0
    yes_subject = 0
    yes_subject_parsed = 0

    for filename in args.seq_record_filenames:
        logging.info('processing sequence record file %s', filename)
        with open_compressed(filename, 'rb') as input_handle:
            reader = fastavro.reader(input_handle)
            for record in reader:
                record_count += 1
                if record['subject'] is None:
                    no_subject += 1
                    if record['sequence']['annotations']['barcode1'] is not None and \
                            record['sequence']['annotations']['target1'] is not None and \
                            record['sequence']['annotations']['barcode2'] is not None and \
                            record['sequence']['annotations']['target2'] is not None:
                        no_subject_full_ident += 1
                    if record['sequence']['annotations']['phix1'] > 0 or \
                            record['sequence']['annotations']['phix2'] > 0:
                        no_subject_phix += 1
                    if args.parse_label:
                        if args.parse_label in record['parses'] and \
                                record['parses'][args.parse_label] is not None:
                            no_subject_parsed += 1
                else:
                    yes_subject += 1
                    if args.parse_label:
                        if args.parse_label in record['parses'] and \
                                record['parses'][args.parse_label] is not None:
                            yes_subject_parsed += 1

    print('processed %d records' % record_count)
    print(' %d (%0.2f%%) had subject' %
          (yes_subject, 100 * yes_subject / record_count))
    if args.parse_label:
        print(' %d (%0.2f%%) of those had parses (%s)' %
              (yes_subject_parsed, 100 * yes_subject_parsed / yes_subject,
               args.parse_label))
    print(' %d (%0.2f%%) had no subject' %
          (no_subject, 100 * no_subject / record_count))
    if no_subject > 0:
        print(' %d (%0.2f%%) of those had parses (%s)' %
              (no_subject_parsed, 100 * no_subject_parsed / no_subject,
               args.parse_label))
        print(' %d (%0.2f%%) of those were PhiX' %
              (no_subject_phix, 100 * no_subject_phix / no_subject))
        print(' %d (%0.2f%%) of those had full idents' %
              (no_subject_full_ident, 100 * no_subject_full_ident / no_subject))

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
def igblast_parse_chain(filenames):
    for filename in filenames:
        with open_compressed(filename, 'rt') as igblast_handle:
            igblast_parse_reader = IgBLASTParser(igblast_handle)
            for parse in igblast_parse_reader:
                yield parse
def main():
    parser = argparse.ArgumentParser(
        description='get the mutation level of each record in an Avro file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('parse_label', metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('filenames', metavar='file', nargs='+',
                        help='the Avro file to read')
    parser.add_argument('--lineage', '-l', metavar='L',
                        help='the lineage label to use')
    parser.add_argument('--min-v-score', '-v', metavar='S', type=float,
                        default=70, help='minimum V-segment score')
    parser.add_argument('--min-j-score', '-j', metavar='S', type=float,
                        default=26, help='minimum J-segment score')

    args = parser.parse_args()

    writer = None
    for filename in args.filenames:
        with open_compressed(filename, 'rb') as read_handle:
            reader = fastavro.reader(read_handle)
            for record in reader:
                parse = record['parses'][args.parse_label]
                best_v, v_score, _, _, _, j_score = best_vdj_score(parse)
                if v_score is not None and j_score is not None and \
                        v_score >= args.min_v_score and j_score >= args.min_j_score:
                    type_ = record['sequence']['annotations']['target1']
                    v_j_in_frame = parse['v_j_in_frame']
                    has_stop_codon = parse['has_stop_codon']

                    best_q = get_parse_query(parse)
                    assert best_q['padding']['start'] == 0
                    q_align = best_q['alignment']
                    v_align = best_v['alignment']
                    mut_level = mutation_level(q_align, v_align)

                    if writer is None:
                        if args.lineage:
                            writer = csv.DictWriter(sys.stdout, fieldnames=[
                                'subject', 'source', 'type', 'lineage',
                                'v_j_in_frame', 'has_stop_codon',
                                'mutation_level'
                            ])
                        else:
                            writer = csv.DictWriter(sys.stdout, fieldnames=[
                                'subject', 'source', 'type', 'v_j_in_frame',
                                'has_stop_codon', 'mutation_level'
                            ])
                        writer.writeheader()

                    row = {
                        'subject': record['subject'],
                        'source': record['source'],
                        'type': type_,
                        'v_j_in_frame': v_j_in_frame,
                        'has_stop_codon': has_stop_codon,
                        'mutation_level': mut_level
                    }
                    if args.lineage:
                        if args.lineage in record['lineages']:
                            row['lineage'] = record['lineages'][args.lineage]
                    writer.writerow(row)
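# mutation_level() is defined elsewhere; a minimal sketch, assuming it returns
# the fraction of aligned, non-gap positions where the query alignment string
# differs from the germline V-segment alignment string:
def mutation_level_sketch(q_align, v_align):
    aligned = 0
    mismatches = 0
    for q, v in zip(q_align, v_align):
        if q == '-' or v == '-':  # skip indel columns
            continue
        aligned += 1
        if q != v:
            mismatches += 1
    return mismatches / aligned if aligned else None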
def main():
    parser = argparse.ArgumentParser(
        description='sort the sequence records in the given Avro file into a '
                    'HIVE style directory structure',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('pathname_base', metavar='dir_path',
                        help='the base pathname for the HIVE style directory structure')
    parser.add_argument('seq_record_filenames', metavar='seq_rec.avro',
                        nargs='+',
                        help='the Avro file with the sequence records')
    # options
    arg_group = parser.add_mutually_exclusive_group(required=False)
    arg_group.add_argument('--no-none', action='store_true',
                           help='do not process records without a subject')
    arg_group.add_argument('--only-none', action='store_true',
                           help='only process records without a subject')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    # make the base directory if it doesn't already exist
    base_path = args.pathname_base
    logging.info('making base directory %s', base_path)
    try:
        os.mkdir(base_path)
    except FileExistsError:
        logging.info('base directory already exists, adding to it')

    # load the records in and group them by subject and source into a list
    with tempfile.TemporaryDirectory() as temp_dir_name:
        logging.info('writing sequences to %s', temp_dir_name)
        temp_handles = {}
        temp_writers = {}
        for filename in args.seq_record_filenames:
            logging.info('loading sequence records from %s', filename)
            with open_compressed(filename, 'rb') as seq_record_handle:
                seq_record_reader = fastavro.reader(seq_record_handle)
                for record in seq_record_reader:
                    subject = record['subject']
                    source = record['source']
                    if args.no_none and subject is None:
                        continue
                    elif args.only_none and subject is not None:
                        continue
                    if subject not in temp_handles:
                        temp_handles[subject] = {}
                        temp_writers[subject] = {}
                    if source not in temp_handles[subject]:
                        temp_handles[subject][source] = open(
                            os.path.join(
                                temp_dir_name,
                                f'subject={subject},source={source}.avro'),
                            'wb')
                        temp_writers[subject][source] = open_avro(
                            temp_handles[subject][source],
                            seq_record_reader.writer_schema)
                    temp_writers[subject][source].write(record)

        # flush and close the writers and handles
        for subject in temp_handles:
            for source in temp_handles[subject]:
                temp_writers[subject][source].flush()
                temp_handles[subject][source].close()
        del temp_writers

        logging.info('writing output')
        # write each subject's records into its own directory
        for subject in temp_handles:
            subject_path = os.path.join(base_path, f'subject={subject}')
            # make sure the subject directory is created
            if os.path.isdir(subject_path):
                logging.info('using existing subject directory %s and adding to it',
                             subject_path)
            else:
                logging.info('making subject directory %s', subject_path)
                os.mkdir(subject_path)
            # for each (subject, source)
            for source in temp_handles[subject]:
                logging.info('loading and sorting records subject=%s/source=%s',
                             subject, source)
                with open(os.path.join(temp_dir_name,
                                       f'subject={subject},source={source}.avro'),
                          'rb') as input_handle:
                    reader = fastavro.reader(input_handle)
                    records = list(reader)
                    records.sort(key=itemgetter('name'))
                    logging.info('writing records subject=%s/source=%s',
                                 subject, source)
                    output_filename = os.path.join(base_path,
                                                   f'subject={subject}',
                                                   f'source={source}.avro')
                    with open(output_filename, 'wb') as output_handle:
                        fastavro.writer(output_handle, reader.writer_schema,
                                        records, codec='bzip2')
                    del records

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
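# open_avro() is defined elsewhere; a minimal sketch, assuming it wraps
# fastavro's incremental Writer so records can be written one at a time and
# flushed before the handle is closed, as done above:
from fastavro.write import Writer

def open_avro_sketch(handle, schema, codec='null'):
    return Writer(handle, schema, codec=codec)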
def avro_1st_field_iterator(filename, fieldname):
    with open_compressed(filename, 'rb') as file_handle:
        reader = fastavro.reader(file_handle)
        for record in reader:
            yield record[fieldname]
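# A usage sketch, streaming one field out of a sequence-record file
# (hypothetical filename):
#   for subject in avro_1st_field_iterator('records.avro', 'subject'):
#       print(subject)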