def write_csv_with_header(self, infile):
    with helpers.getFileHandle(self.filepath, 'wt') as writer:
        with helpers.getFileHandle(infile) as reader:
            writer.write(self.header_line)
            self.write_csv_data(reader, writer)

    self.__write_yaml()
def write_headerless_csv(self, infile):
    with helpers.getFileHandle(self.filepath, 'wt') as writer:
        with helpers.getFileHandle(infile) as reader:
            if reader.readline() != self.header_line:
                raise CsvWriterError("cannot write, wrong header")
            self.write_csv_data(reader, writer)

    self.write_yaml()
def concatenate_files(self, infiles):
    header = self.header_line if self.header else None

    with helpers.getFileHandle(self.filepath, 'wt') as writer:
        if header:
            writer.write(header)
        for infile in infiles:
            with helpers.getFileHandle(infile) as reader:
                self.write_csv_data(reader, writer)

    self.write_yaml()
def parse_segs(self, segs, metrics):
    """parses hmmcopy segments data

    :param segs: path to hmmcopy segs file
    :param metrics: per-cell metric values keyed by cell_id, compared
        against self.quality_threshold to filter cells
    """
    header_flag, dtypes, columns = csvutils.get_metadata(segs)

    header = {v: i for i, v in enumerate(columns)}

    segs_data = {}

    with helpers.getFileHandle(segs) as segfile:
        if header_flag:
            assert segfile.readline().strip().split(',') == columns

        for row in segfile:
            row = row.strip().split(',')

            chrom = row[header["chr"]]
            start = row[header["start"]]
            end = row[header["end"]]
            cell_id = row[header["cell_id"]]
            state = row[header["state"]]

            # float to handle scientific notation
            segment_length = int(float(end)) - int(float(start)) + 1

            if metrics[cell_id] > self.quality_threshold:
                continue

            segs_data[cell_id] = [cell_id, chrom, start, end, segment_length, state]

    return segs_data
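# Hedged, self-contained check (not part of the pipeline): hmmcopy segment
# coordinates may be written in scientific notation (e.g. "5e+05"), which is
# why parse_segs converts through float before int. The values are invented:
assert int(float('5e+05')) - int(float('1e+05')) + 1 == 400001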
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, adapter, adapter2, trimgalore_docker):
    """
    run fastqc on both fastq files
    run trimgalore if needed, copy if not.
    """
    # empty input: skip trimming entirely and return the originals
    with helpers.getFileHandle(fastq1) as reader:
        if not reader.readline():
            return fastq1, fastq2

    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    rep1 = os.path.join(reports_dir, '{}_trimgalore_R1.html'.format(cell_id))
    rep2 = os.path.join(reports_dir, '{}_trimgalore_R2.html'.format(cell_id))
    qcrep1 = os.path.join(reports_dir, '{}_trimgalore_qc_R1.html'.format(cell_id))
    qcrep2 = os.path.join(reports_dir, '{}_trimgalore_qc_R2.html'.format(cell_id))
    qczip1 = os.path.join(reports_dir, '{}_trimgalore_qc_R1.zip'.format(cell_id))
    qczip2 = os.path.join(reports_dir, '{}_trimgalore_qc_R2.zip'.format(cell_id))

    run_tg = RunTrimGalore(
        fastq1, fastq2, trim1, trim2, 'trim_galore', 'cutadapt',
        tempdir, adapter, adapter2, rep1, rep2, qcrep1, qcrep2,
        qczip1, qczip2, trimgalore_docker
    )
    run_tg.run_trimgalore()
    run_tg.gather_outputs()

    return trim1, trim2
def write_summary_counts(counts, outfile, cell_id, fastqscreen_params):
    genomes = [genome['name'] for genome in fastqscreen_params['genomes']]

    summary_counts = defaultdict(int)
    for read_end, read_end_counts in counts.items():
        for flags, count in read_end_counts.items():
            hit_orgs = [v[0] for v in flags if v[1] > 0]

            for org in hit_orgs:
                summary_counts[org] += count

            if len(hit_orgs) > 1:
                for org in hit_orgs:
                    summary_counts['{}_multihit'.format(org)] += count
            elif len(hit_orgs) == 0:
                summary_counts['nohit'] += count

    with helpers.getFileHandle(outfile, 'wt') as writer:
        if not summary_counts:
            # no counts at all: emit a header-only file so downstream
            # merges still see the expected columns
            columns = ['cell_id']
            columns += ['fastqscreen_' + genome for genome in genomes]
            columns += ['fastqscreen_nohit']
            header = ','.join(columns) + '\n'
            writer.write(header)
            return

        keys = sorted(summary_counts.keys())

        header = ['cell_id'] + ['fastqscreen_{}'.format(key) for key in keys]
        header = ','.join(header) + '\n'
        writer.write(header)

        values = [cell_id] + [summary_counts[v] for v in keys]
        values = ','.join(map(str, values)) + '\n'
        writer.write(values)
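# Hedged usage sketch for the four-argument write_summary_counts above:
# `counts` is keyed by read end, then by a tuple of (genome, hit_count)
# flags, matching the loops in the function. The cell id, genome names,
# output path and counts here are all invented for illustration.
def _example_write_summary_counts():
    counts = {
        'R1': {
            (('grch37', 1), ('mm10', 0)): 95,  # hits grch37 only
            (('grch37', 1), ('mm10', 1)): 3,   # multihit: counted for both plus *_multihit
            (('grch37', 0), ('mm10', 0)): 2,   # counted under nohit
        },
    }
    params = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}]}
    write_summary_counts(counts, 'summary_counts.csv', 'CELL-001', params)
    # expected header: cell_id,fastqscreen_grch37,fastqscreen_grch37_multihit,
    # fastqscreen_mm10,fastqscreen_mm10_multihit,fastqscreen_nohit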
def run_fastq_screen_paired_end(fastq_r1, fastq_r2, tempdir, params, docker_image=None):
    def get_basename(filepath):
        filepath_base = os.path.basename(filepath)

        if filepath_base.endswith('.fastq.gz'):
            filepath_base = filepath_base[:-len('.fastq.gz')]
        elif filepath_base.endswith('.fq.gz'):
            filepath_base = filepath_base[:-len('.fq.gz')]
        elif filepath_base.endswith('.fastq'):
            filepath_base = filepath_base[:-len('.fastq')]
        elif filepath_base.endswith('.fq'):
            filepath_base = filepath_base[:-len('.fq')]
        else:
            raise Exception('unknown file format. {}'.format(filepath))
        return filepath_base

    basename = get_basename(fastq_r1)
    tagged_fastq_r1 = os.path.join(tempdir, '{}.tagged.fastq.gz'.format(basename))

    basename = get_basename(fastq_r2)
    tagged_fastq_r2 = os.path.join(tempdir, '{}.tagged.fastq.gz'.format(basename))

    # fastq screen fails if run on empty files
    with helpers.getFileHandle(fastq_r1) as reader:
        if not reader.readline():
            shutil.copy(fastq_r1, tagged_fastq_r1)
            shutil.copy(fastq_r2, tagged_fastq_r2)
            return tagged_fastq_r1, tagged_fastq_r2

    config = os.path.join(tempdir, 'fastq_screen.config')
    with open(config, 'w') as config_writer:
        for genome in params['genomes']:
            genome_name = genome['name']
            genome_path = genome['path']
            outstr = '\t'.join(['DATABASE', genome_name, genome_path]) + '\n'
            config_writer.write(outstr)

    cmd = [
        'fastq_screen',
        '--aligner', params['aligner'],
        '--conf', config,
        '--outdir', tempdir,
        '--tag',
        fastq_r1, fastq_r2,
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    return tagged_fastq_r1, tagged_fastq_r2
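# Hedged sketch of the `params` layout run_fastq_screen_paired_end expects;
# genome names and index paths are made up. Each genome entry becomes one
# line of the generated fastq_screen.config:
#
#   DATABASE<TAB>grch37<TAB>/refdata/grch37/bowtie2_index
#
_EXAMPLE_FASTQSCREEN_PARAMS = {
    'aligner': 'bowtie2',
    'genomes': [
        {'name': 'grch37', 'path': '/refdata/grch37/bowtie2_index'},
        {'name': 'mm10', 'path': '/refdata/mm10/bowtie2_index'},
    ],
}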
def write_detailed_counts(counts, outfile, cell_id, fastqscreen_params):
    header = None
    genomes = [genome['name'] for genome in fastqscreen_params['genomes']]

    with helpers.getFileHandle(outfile, 'wt') as writer:
        for read_end, read_end_counts in counts.items():
            if not read_end_counts and not header:
                # empty input: write the expected header and move on
                outstr = ['cell_id', 'readend'] + genomes + ['count']
                writer.write(','.join(outstr) + '\n')
                header = 1
                continue

            if not header:
                # genome column names come from the first flag tuple
                outstr = ['cell_id', 'readend']
                outstr += [v[0] for v in list(read_end_counts.keys())[0]]
                outstr += ['count']
                writer.write(','.join(outstr) + '\n')
                header = 1

            for flags, count in read_end_counts.items():
                outstr = [cell_id, read_end]
                outstr += [v[1] for v in flags]
                outstr += [count]
                writer.write(','.join(map(str, outstr)) + '\n')
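# Hedged usage sketch for the fastqscreen_params variant above, with
# invented names: one read end, two genomes, one flag combination.
def _example_write_detailed_counts():
    counts = {'R1': {(('grch37', 1), ('mm10', 0)): 95}}
    params = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}]}
    write_detailed_counts(counts, 'detailed_counts.csv', 'CELL-001', params)
    # expected file contents:
    #   cell_id,readend,grch37,mm10,count
    #   CELL-001,R1,1,0,95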
def filter_reads(input_r1, input_r2, output_r1, output_r2, reference):
    reader = fastqutils.PairedTaggedFastqReader(input_r1, input_r2)

    with helpers.getFileHandle(output_r1, 'wt') as writer_r1, \
            helpers.getFileHandle(output_r2, 'wt') as writer_r2:
        for read_1, read_2 in reader.filter_read_iterator(reference):
            read_1 = reader.add_tag_to_read_comment(read_1)
            read_2 = reader.add_tag_to_read_comment(read_2)

            for line in read_1:
                writer_r1.write(line)
            for line in read_2:
                writer_r2.write(line)
def read_metrics_csv(self, cndata):
    """
    read the input file
    """
    samples = cndata.index

    data = {}
    numread_data = {}
    reads_per_bin_data = {}
    sepdata = defaultdict(list)
    colordata = {}

    header, dtypes, columns = csvutils.get_metadata(self.metrics)
    idxs = self.build_label_indices(columns)

    color_col = self.color_by_col
    sep_col = self.plot_by_col

    with helpers.getFileHandle(self.metrics) as freader:
        if header:
            assert freader.readline().strip().split(',') == columns

        for line in freader:
            line = line.strip().split(self.sep)

            sample_id = line[idxs['cell_id']]

            # skip samples that are just na or inf
            if sample_id not in samples:
                continue

            val = line[idxs["mad_neutral_state"]]
            val = float('nan') if val == "NA" else float(val)

            ec = 'all' if sep_col == 'all' else line[idxs[sep_col]]
            cc = line[idxs[color_col]]

            numreads = float(line[idxs['total_mapped_reads_hmmcopy']])

            reads_per_bin = line[idxs['median_hmmcopy_reads_per_bin']]
            reads_per_bin = 0 if reads_per_bin == "NA" else float(reads_per_bin)

            if self.cellcalls and cc not in self.cellcalls:
                continue

            numread_data[sample_id] = numreads
            data[sample_id] = val
            reads_per_bin_data[sample_id] = reads_per_bin

            colordata[sample_id] = cc
            sepdata[ec].append(sample_id)

    return data, sepdata, colordata, numread_data, reads_per_bin_data
def generate_metadata(self):
    with helpers.getFileHandle(self.filepath) as inputfile:
        header = inputfile.readline().strip()
        sep = self.__detect_sep_from_header(header)
        columns = header.split(sep)

    header = True
    dtypes = self.__generate_dtypes(sep=sep)

    return header, sep, dtypes, columns
def re_tag_reads(infile, outfile):
    reader = fastqutils.TaggedFastqReader(infile)

    with helpers.getFileHandle(outfile, 'wt') as writer:
        for read in reader.get_read_iterator():
            read = reader.add_tag_to_read_comment(read)

            for line in read:
                writer.write(line)
def __write_yaml(self):
    yamldata = {'header': self.header, 'sep': self.sep, 'columns': []}

    for column in self.columns:
        data = {'name': column, 'dtype': self.dtypes[column]}
        yamldata['columns'].append(data)

    with helpers.getFileHandle(self.yaml_file, 'wt') as f:
        yaml.safe_dump(yamldata, f, default_flow_style=False)
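# Hedged example of the sidecar __write_yaml produces, with invented column
# names and dtypes. yaml.safe_dump sorts keys by default, so the file should
# look roughly like:
#
#   columns:
#   - dtype: str
#     name: cell_id
#   - dtype: int
#     name: total_mapped_reads
#   header: true
#   sep: ','
_EXAMPLE_YAML_METADATA = {
    'header': True,
    'sep': ',',
    'columns': [
        {'name': 'cell_id', 'dtype': 'str'},
        {'name': 'total_mapped_reads', 'dtype': 'int'},
    ],
}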
def read_segs_csv(self):
    """
    read the input file
    """
    data = {}
    bins = {}

    header, dtypes, columns = csvutils.get_metadata(self.input)

    with helpers.getFileHandle(self.input, 'rt') as freader:
        idxs = self.build_label_indices(columns)

        if header:
            assert freader.readline().strip().split(',') == columns

        for line in freader:
            line = line.strip().split(self.sep)

            sample_id = line[idxs['cell_id']]

            val = line[idxs[self.column_name]]
            val = float('nan') if val == "NA" else float(val)

            chrom = line[idxs['chr']]
            start = int(line[idxs['start']])
            end = int(line[idxs['end']])

            seg = (chrom, start, end)

            if self.mappability_threshold and \
                    float(line[idxs["map"]]) <= self.mappability_threshold:
                val = float("nan")

            if chrom not in bins:
                bins[chrom] = set()
            bins[chrom].add((start, end))

            # just a sanity check, not required
            if sample_id in data and seg in data[sample_id]:
                raise Exception("repeated val")

            if sample_id not in data:
                data[sample_id] = {}

            data[sample_id][seg] = val

    samples = sorted(data.keys())
    bins = self.sort_bins_csv(bins)

    data = self.conv_to_matrix(data, bins, samples)
    data = self.get_pandas_dataframe(data, bins)

    return data
def merge_fastqs(inputs, output):
    read_counter = 0

    with helpers.getFileHandle(output, 'wt') as merged:
        for cellid in inputs:
            infile = inputs[cellid]

            reader = fastqutils.FastqReader(infile)

            for read in reader.get_read_iterator():
                # renumber the read but keep the /1 or /2 pair suffix
                read[0] = '@' + str(read_counter) + '/' + read[0].split('/')[1]

                for line in read:
                    merged.write(line)

                read_counter += 1
def annotate_ref_alt(haps_csv, refdir, output_csv):
    thousand_genomes = os.path.join(refdir, 'thousand_genomes_snps.tsv')

    annotation_data = {}
    with helpers.getFileHandle(thousand_genomes, 'rt') as db:
        for line in db:
            line = line.strip().split('\t')
            chrom, pos, ref, alt = line
            annotation_data[(chrom, pos)] = (ref, alt)

    with helpers.getFileHandle(haps_csv, 'rt') as reader, \
            helpers.getFileHandle(output_csv, 'wt') as writer:
        header = reader.readline().strip()
        header += '\tref\talt\n'
        writer.write(header)

        for line in reader:
            line = line.strip()
            l_split = line.split('\t')

            chrom = l_split[0]
            pos = l_split[1]

            if (chrom, pos) in annotation_data:
                ref, alt = annotation_data[(chrom, pos)]
            else:
                ref = 'NA'
                alt = 'NA'

            line += '\t{}\t{}\n'.format(ref, alt)
            writer.write(line)
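# Hedged sketch of the tab-separated layouts annotate_ref_alt assumes
# (values invented): each thousand_genomes_snps.tsv row is
#
#   chrom<TAB>pos<TAB>ref<TAB>alt, e.g.  1<TAB>10177<TAB>A<TAB>C
#
# and every haps_csv data row gains trailing ref/alt columns, falling back
# to NA when its (chrom, pos) pair is absent from the lookup.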
def re_index_reads(input_fastq, output_fastq, cell_id, cells, cell_read_counts, tag=False):
    start_count = get_start_count(cells, cell_read_counts, cell_id)

    with helpers.getFileHandle(input_fastq) as infile:
        with helpers.getFileHandle(output_fastq, 'wt') as outfile:
            while True:
                fastq_read = list(islice(infile, 4))

                if not fastq_read:
                    break

                assert len(fastq_read) == 4, 'fastq file format error'

                if not fastq_read[0].startswith('@'):
                    raise ValueError('Expected @ as first character of read name')

                if not fastq_read[2].startswith('+'):
                    raise ValueError('Expected + as first character of read comment')

                fastq_read[0] = '@' + str(start_count) + '/' + fastq_read[0].split('/')[1]
                start_count += 1

                if tag:
                    fastq_read[2] = '+' + cell_id + "\n"

                for fastq_line in fastq_read:
                    outfile.write(fastq_line)
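# Hedged example of the renaming performed above (read name and cell id
# invented): with start_count 1000 and tag=True for cell 'CELL-001',
#
#   @HISEQ:1:1101:1224:2110/1  ->  @1000/1
#   +                          ->  +CELL-001
#
# so read names become a running index while the /1 or /2 suffix is kept.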
def filter_tag_reads(input_r1, input_r2, output_r1, output_r2, params):
    genomes = [v['name'] for v in params['genomes']]

    if not params['filter_tags']:
        filter_tags = set()
    else:
        filter_tags = set(params['filter_tags'])

    reader = fastqutils.PairedTaggedFastqReader(input_r1, input_r2)

    with helpers.getFileHandle(output_r1, 'wt') as writer_r1, \
            helpers.getFileHandle(output_r2, 'wt') as writer_r2:
        for read_1, read_2 in reader.filter_read_iterator(genomes, filter_tags):
            read_1 = reader.add_tag_to_read_comment(read_1)
            read_2 = reader.add_tag_to_read_comment(read_2)

            for line in read_1:
                writer_r1.write(line)
            for line in read_2:
                writer_r2.write(line)
def get_read_count(input_fastq):
    def blocks(files, size=65536):
        while True:
            b = files.read(size)
            if not b:
                break
            yield b

    with helpers.getFileHandle(input_fastq) as indata:
        linecount = sum(bl.count("\n") for bl in blocks(indata))

    assert linecount % 4 == 0

    return linecount // 4
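# Hedged usage sketch (hypothetical path): a well-formed fastq has four
# lines per record, so a file with 400 newline-terminated lines yields 100.
#
#   n_reads = get_read_count('/data/CELL-001_R1.fastq.gz')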
def __parse_metadata(self):
    with helpers.getFileHandle(self.filepath + '.yaml') as yamlfile:
        yamldata = yaml.safe_load(yamlfile)

    header = yamldata['header']
    sep = yamldata.get('sep', ',')

    dtypes = {}
    columns = []
    for coldata in yamldata['columns']:
        colname = coldata['name']

        dtypes[colname] = coldata['dtype']
        columns.append(colname)

    return header, sep, dtypes, columns
def get_read_iterator(self):
    with helpers.getFileHandle(self.file_path) as fq_reader:
        while True:
            fastq_read = list(islice(fq_reader, 4))

            if not fastq_read:
                break

            assert len(fastq_read) == 4, 'fastq file format error'

            if not fastq_read[0].startswith('@'):
                raise ValueError('Expected @ as first character of read name')

            if not fastq_read[2].startswith('+'):
                raise ValueError('Expected + as first character of read comment')

            yield fastq_read
def check_empty_file(self, path):
    """checks if file is empty

    :param path: path to the file
    :returns bool: True if file is empty, False otherwise
    """
    if not os.path.exists(path):
        raise IOError("Input file %s missing" % path)

    if os.stat(path).st_size == 0:
        return True

    with helpers.getFileHandle(path) as infile:
        # header line
        _ = infile.readline()
        # data?
        data = infile.readline()
        if not data:
            return True

    return False
def write_detailed_counts(counts, outfile, cell_id):
    header = None

    with helpers.getFileHandle(outfile, 'wt') as writer:
        for read_end, read_end_counts in counts.items():
            if not read_end_counts:
                continue

            if not header:
                outstr = ['cell_id', 'readend']
                outstr += [v[0] for v in list(read_end_counts.keys())[0]]
                outstr += ['count']
                writer.write(','.join(outstr) + '\n')
                header = 1

            for flags, count in read_end_counts.items():
                outstr = [cell_id, read_end]
                outstr += [v[1] for v in flags]
                outstr += [count]
                writer.write(','.join(map(str, outstr)) + '\n')
def write_summary_counts(counts, outfile, cell_id):
    summary_counts = defaultdict(int)

    for read_end, read_end_counts in counts.items():
        for flags, count in read_end_counts.items():
            hit_orgs = [v[0] for v in flags if v[1] > 0]

            for org in hit_orgs:
                summary_counts[org] += count

            if len(hit_orgs) > 1:
                for org in hit_orgs:
                    summary_counts['{}_multihit'.format(org)] += count
            elif len(hit_orgs) == 0:
                summary_counts['nohit'] += count

    with helpers.getFileHandle(outfile, 'wt') as writer:
        keys = sorted(summary_counts.keys())

        header = ['cell_id'] + ['fastqscreen_{}'.format(key) for key in keys]
        header = ','.join(header) + '\n'
        writer.write(header)

        values = [cell_id] + [summary_counts[v] for v in keys]
        values = ','.join(map(str, values)) + '\n'
        writer.write(values)
def __detect_sep_from_file(self):
    with helpers.getFileHandle(self.filepath) as reader:
        header = reader.readline().strip()
        return self.__detect_sep_from_header(header)