def process_files(options):
    input_filenames = [options.umi] + options.seq
    input_files = [pyfastx.Fastx(fname) for fname in input_filenames]
    output_filenames = [make_output_filename(name) for name in options.seq]
    output_files = [open(fname, "w") for fname in output_filenames]

    for records in zip_longest(*input_files):
        if len(records) >= 1:
            umi_record = records[0]
            if umi_record is not None and len(umi_record) == 4:
                umi_name, umi_seq, _umi_qual, umi_comment = umi_record
            elif umi_record is None:
                exit_with_error("Input FASTQ files do not have the same number of records", EXIT_FILE_IO_ERROR)
            else:
                exit_with_error(f"Badly formed UMI record in input UMI FASTQ file: {umi_record}", EXIT_FILE_IO_ERROR)

            fastq_records = records[1:]
            for output_file, this_record in zip(output_files, fastq_records):
                if this_record is not None and len(this_record) == 4:
                    this_name, this_seq, this_qual, this_comment = this_record
                    if this_name == umi_name:
                        new_name = this_name + options.sep + umi_seq
                        print(f"@{new_name} {this_comment}\n{this_seq}\n+\n{this_qual}", file=output_file)
                elif this_record is None:
                    exit_with_error("Input FASTQ files do not have the same number of records", EXIT_FILE_IO_ERROR)
                else:
                    exit_with_error(f"Badly formed FASTQ record in input FASTQ file: {this_record}", EXIT_FILE_IO_ERROR)

    # FASTX does not appear to provide a proper context manager for files, so
    # we resort to trying to close files here.
    # Issue has been created on GitHub: https://github.com/lmdu/pyfastx/issues/27
    #for file in input_files:
    #    file.close()
    for file in output_files:
        file.close()
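The pyfastx.Fastx iterator exposes no close() (hence the commented-out loop above), but the plain output handles can still be closed reliably even when an error is raised mid-loop. Below is a minimal sketch using contextlib.ExitStack; open_outputs is a hypothetical helper and the filenames are placeholders, not part of the original tool.

from contextlib import ExitStack

def open_outputs(filenames):
    # Hypothetical helper: open several files for writing and return an
    # ExitStack that closes all of them, even if a later write raises.
    stack = ExitStack()
    files = [stack.enter_context(open(name, "w")) for name in filenames]
    return stack, files

# usage: every handle is guaranteed closed when the with-block exits
stack, output_files = open_outputs(["out1.fastq", "out2.fastq"])  # placeholder names
with stack:
    for handle in output_files:
        print("@read1 comment\nACGT\n+\nIIII", file=handle)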
def find_tandem_repeats_with_multicore(args):
    pool = mp.Pool(args.threads)
    manager = mp.Manager()
    tasks = manager.Queue(args.threads * 2)
    event = manager.Event()

    #add workers
    width = len(str(args.threads))
    for i in range(args.threads):
        work_id = str(i).zfill(width)
        pool.apply_async(find_tandem_repeats_worker, (args, tasks, event, work_id))

    #add tasks
    for name, seq, _ in pyfastx.Fastx(args.fasta, uppercase=True):
        tasks.put((name, seq), block=True, timeout=None)

    event.set()
    pool.close()
    pool.join()

    #merge results
    with open(args.out_file, 'wb') as fw:
        for i in range(args.threads):
            temp_file = "{}.{}".format(args.out_file, str(i).zfill(width))
            if os.path.isfile(temp_file):
                with open(temp_file, 'rb') as fh:
                    shutil.copyfileobj(fh, fw)
                #remove temp file
                os.remove(temp_file)
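The worker function itself is not shown above. The following is a minimal sketch of find_tandem_repeats_worker that matches the queue/event protocol: consume tasks until the producer sets the event and the queue drains, writing to the per-worker temp file that the merge step concatenates. It assumes the same find_tandem_repeats_by_type and format_and_write_to_file helpers used by the single-core path below; this is an illustration, not the project's actual implementation.

import queue

def find_tandem_repeats_worker(args, tasks, event, work_id):
    # Hypothetical sketch: pull (name, seq) pairs from the shared queue and
    # write results to "{out_file}.{work_id}", the name the merge loop expects.
    temp_file = "{}.{}".format(args.out_file, work_id)
    with open(temp_file, 'w') as fw:
        while not (event.is_set() and tasks.empty()):
            try:
                name, seq = tasks.get(block=True, timeout=0.1)
            except queue.Empty:
                continue
            tres = find_tandem_repeats_by_type(name, seq, args)
            format_and_write_to_file(args, fw, tres)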
def test_fastq_iter(self):
    reads = {}
    with open(flat_fastq) as fh:
        for line in fh:
            line = line.strip()
            if line[0] == '@':
                name = line.split()[0][1:]
                comment = ' '.join(line.split()[1:])
                reads[name] = [comment]
            elif line == '+':
                continue
            else:
                reads[name].append(line)

    for name, seq, qual, comment in pyfastx.Fastx(gzip_fastq, "fastq"):
        r = reads[name]
        self.assertEqual(r[0], comment)
        self.assertEqual(r[1], seq)
        self.assertEqual(r[2], qual)
def process(self):
    progress = 0
    processed_size = 0
    processed_file = 0

    with multiprocessing.Pool(1) as pool:
        for fasta in self.fastas:
            self.findex = fasta[0]
            total_size = fasta[2]

            #create ssr table for current file
            DB.create_table(self._table, self.findex)
            self.change_status('running')

            seqs = pyfastx.Fastx(fasta[4], uppercase=True)
            sql = self.sql()

            for name, seq, _ in seqs:
                self.signals.messages.emit(
                    'processing sequence {} in file {}'.format(name, fasta[1]))

                proc = pool.apply_async(self.search, self.args(name, seq))
                trs = proc.get()
                DB.insert_rows(sql, self.rows(trs))

                processed_size += len(seq)
                if processed_size > total_size:
                    r = 0
                else:
                    r = processed_size / total_size

                p = int((processed_file + r) / self.total_file * 100)
                if p > progress:
                    self.signals.progress.emit(p)
                    progress = p

            processed_file += 1

    self.change_status('success')
def find_tandem_repeats_with_singlecore(args):
    with open(args.out_file, 'w') as fw:
        for name, seq, _ in pyfastx.Fastx(args.fasta, uppercase=True):
            tres = find_tandem_repeats_by_type(name, seq, args)
            format_and_write_to_file(args, fw, tres)
def test_exception(self):
    with self.assertRaises(FileExistsError):
        _ = pyfastx.Fastx('test_file')

    with self.assertRaises(RuntimeError):
        _ = pyfastx.Fastx(gzip_fasta, format="fastx")
def test_fastx_repr(self):
    fa = pyfastx.Fastx(gzip_fasta, "fasta")
    self.assertEqual(repr(fa), "<Fastx> fasta {}".format(gzip_fasta))

    fq = pyfastx.Fastx(gzip_fastq, "fastq")
    self.assertEqual(repr(fq), "<Fastx> fastq {}".format(gzip_fastq))
def test_fasta_upper(self):
    for name, seq, _ in pyfastx.Fastx(flat_fasta, uppercase=True):
        self.assertEqual(str(self.faidx[name]), seq)
def test_fasta_iter(self):
    for name, seq, comment in pyfastx.Fastx(gzip_fasta):
        s = self.faidx[name]
        self.assertEqual(str(s), seq)
        self.assertEqual(' '.join(s.long_name.split()[1:]), comment)
import pyfastx
import simplesam
import os

os.chdir('/research/projects/yu3grp/IO_JY/yu3grp/LVXSCID/patients_scATACseq/multiome_P1')

bam_file = './03_chimeric/P1_scMulti_ATAC_S1_pe.mated.filter.bam'
out_sam_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_wCB.sam'
cellID_file = './04_match_CB/P1_scMulti_ATAC_S1_pe.mated.filter_R2.fastq'

#fa = pyfastx.Fastx('./LVX_SCID_P1_S1_L001_pe.mated.filter2.bam_readbarcode')
fa = pyfastx.Fastx(cellID_file)

# map each read name to its cell barcode sequence from the R2 FASTQ
barcodes = {}
for name, seq, qual, comment in fa:
    barcodes[name] = seq

barcode_tag = 'CB'
with simplesam.Reader(open(bam_file)) as in_bam:
    with simplesam.Writer(open(out_sam_file, 'w'), in_bam.header) as out_sam:
        for read in in_bam:
            #read[umi_tag] = barcodes[read.qname][0]
            read[barcode_tag] = barcodes[read.qname]
            out_sam.write(read)