Example No. 1
    def write_csv_with_header(self, infile):
        with helpers.getFileHandle(self.filepath, 'wt') as writer:
            with helpers.getFileHandle(infile) as reader:
                writer.write(self.header_line)
                self.write_csv_data(reader, writer)

        self.__write_yaml()
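
Every example on this page funnels file access through helpers.getFileHandle. The helper itself never appears in these listings; a minimal sketch consistent with how it is called (read mode by default, 'wt' for writing, gzipped fastq paths passed directly) might look like the following. The gzip handling is an assumption inferred from the .fastq.gz paths in later examples, not the verified implementation.

import gzip


def getFileHandle(filename, mode='rt'):
    # assumed behavior: open gzip-compressed files transparently,
    # fall back to plain open() for everything else
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    return open(filename, mode)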
Example No. 2
    def write_headerless_csv(self, infile):
        with helpers.getFileHandle(self.filepath, 'wt') as writer:
            with helpers.getFileHandle(infile) as reader:
                if reader.readline() != self.header_line:
                    raise CsvWriterError("cannot write, wrong header")
                self.write_csv_data(reader, writer)

        self.write_yaml()
Example No. 3
    def concatenate_files(self, infiles):
        header = self.header_line if self.header else None

        with helpers.getFileHandle(self.filepath, 'wt') as writer:
            if header:
                writer.write(header)
            for infile in infiles:
                with helpers.getFileHandle(infile) as reader:
                    self.write_csv_data(reader, writer)

        self.write_yaml()
Example No. 4
    def parse_segs(self, segs, metrics):
        """parses hmmcopy segments data
        :param segs: path to hmmcopy segs file
        """
        header_flag, dtypes, columns = csvutils.get_metadata(segs)

        header = {v: i for i, v in enumerate(columns)}

        segs_data = {}

        with helpers.getFileHandle(segs) as segfile:

            if header_flag:
                assert segfile.readline().strip().split(',') == columns

            for row in segfile:
                row = row.strip().split(',')

                chrom = row[header["chr"]]
                start = row[header["start"]]
                end = row[header["end"]]
                cell_id = row[header["cell_id"]]
                state = row[header["state"]]
                # float to handle scientific notation
                segment_length = int(float(end)) - int(float(start)) + 1

                if metrics[cell_id] > self.quality_threshold:
                    continue

                segs_data[cell_id] = [
                    cell_id, chrom, start, end, segment_length, state
                ]
        return segs_data
Example No. 5
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, adapter, adapter2,
                trimgalore_docker):
    """
    run fastqc on both fastq files
    run trimgalore if needed, copy if not.
    """
    with helpers.getFileHandle(fastq1) as reader:
        if not reader.readline():
            return fastq1, fastq2

    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    rep1 = os.path.join(reports_dir, '{}_trimgalore_R1.html'.format(cell_id))
    rep2 = os.path.join(reports_dir, '{}_trimgalore_R2.html'.format(cell_id))
    qcrep1 = os.path.join(reports_dir,
                          '{}_trimgalore_qc_R1.html'.format(cell_id))
    qcrep2 = os.path.join(reports_dir,
                          '{}_trimgalore_qc_R2.html'.format(cell_id))
    qczip1 = os.path.join(reports_dir,
                          '{}_trimgalore_qc_R1.zip'.format(cell_id))
    qczip2 = os.path.join(reports_dir,
                          '{}_trimgalore_qc_R2.zip'.format(cell_id))

    run_tg = RunTrimGalore(fastq1, fastq2, trim1, trim2, 'trim_galore',
                           'cutadapt', tempdir, adapter, adapter2, rep1, rep2,
                           qcrep1, qcrep2, qczip1, qczip2, trimgalore_docker)
    run_tg.run_trimgalore()
    run_tg.gather_outputs()

    return trim1, trim2
Example No. 6
def write_summary_counts(counts, outfile, cell_id, fastqscreen_params):
    genomes = [genome['name'] for genome in fastqscreen_params['genomes']]

    summary_counts = defaultdict(int)
    for read_end, read_end_counts in counts.items():
        for flags, count in read_end_counts.items():
            hit_orgs = [v[0] for v in flags if v[1] > 0]

            for org in hit_orgs:
                summary_counts[org] += count

            if len(hit_orgs) > 1:
                for org in hit_orgs:
                    summary_counts['{}_multihit'.format(org)] += count
            elif len(hit_orgs) == 0:
                summary_counts['nohit'] += count

    with helpers.getFileHandle(outfile, 'wt') as writer:
        if not summary_counts:
            columns = ['cell_id']
            columns += ['fastqscreen_' + genome for genome in genomes]
            columns += ['fastqscreen_nohit']
            header = ','.join(columns) + '\n'
            writer.write(header)
            return

        keys = sorted(summary_counts.keys())
        header = ['cell_id'] + ['fastqscreen_{}'.format(key) for key in keys]
        header = ','.join(header) + '\n'
        writer.write(header)

        values = [cell_id] + [summary_counts[v] for v in keys]
        values = ','.join(map(str, values)) + '\n'
        writer.write(values)
Example No. 7
def run_fastq_screen_paired_end(fastq_r1,
                                fastq_r2,
                                tempdir,
                                params,
                                docker_image=None):
    def get_basename(filepath):
        filepath_base = os.path.basename(filepath)

        if filepath_base.endswith('.fastq.gz'):
            filepath_base = filepath_base[:-len('.fastq.gz')]
        elif filepath_base.endswith('.fq.gz'):
            filepath_base = filepath_base[:-len('.fq.gz')]
        elif filepath_base.endswith('.fastq'):
            filepath_base = filepath_base[:-len('.fastq')]
        elif filepath_base.endswith('.fq'):
            filepath_base = filepath_base[:-len('.fq')]
        else:
            raise ValueError('unknown file format: {}'.format(filepath))
        return filepath_base

    basename = get_basename(fastq_r1)
    tagged_fastq_r1 = os.path.join(tempdir,
                                   '{}.tagged.fastq.gz'.format(basename))

    basename = get_basename(fastq_r2)
    tagged_fastq_r2 = os.path.join(tempdir,
                                   '{}.tagged.fastq.gz'.format(basename))

    # fastq screen fails if run on empty files
    with helpers.getFileHandle(fastq_r1) as reader:
        if not reader.readline():
            shutil.copy(fastq_r1, tagged_fastq_r1)
            shutil.copy(fastq_r2, tagged_fastq_r2)
            return tagged_fastq_r1, tagged_fastq_r2

    config = os.path.join(tempdir, 'fastq_screen.config')

    with open(config, 'w') as config_writer:
        for genome in params['genomes']:
            genome_name = genome['name']
            genome_path = genome['path']
            outstr = '\t'.join(['DATABASE', genome_name, genome_path]) + '\n'
            config_writer.write(outstr)

    cmd = [
        'fastq_screen',
        '--aligner',
        params['aligner'],
        '--conf',
        config,
        '--outdir',
        tempdir,
        '--tag',
        fastq_r1,
        fastq_r2,
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    return tagged_fastq_r1, tagged_fastq_r2
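
For reference, the fastq_screen.config written above is just a list of tab-separated DATABASE lines, one per genome. With hypothetical genome names and index paths it would contain something like:

DATABASE	grch37	/refdata/fastq_screen/grch37
DATABASE	mm10	/refdata/fastq_screen/mm10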
Example No. 8
def write_detailed_counts(counts, outfile, cell_id, fastqscreen_params):
    header = None

    genomes = [genome['name'] for genome in fastqscreen_params['genomes']]

    with helpers.getFileHandle(outfile, 'wt') as writer:

        for read_end, read_end_counts in counts.items():

            if not read_end_counts and not header:
                outstr = ['cell_id', 'readend'] + genomes + ['count']
                writer.write(','.join(outstr) + '\n')
                header = True
                continue

            if not header:
                outstr = ['cell_id', 'readend']
                outstr += [v[0] for v in list(read_end_counts.keys())[0]]
                outstr += ['count']
                writer.write(','.join(outstr) + '\n')
                header = True

            for flags, count in read_end_counts.items():
                outstr = [cell_id, read_end]
                outstr += [v[1] for v in flags]
                outstr += [count]
                writer.write(','.join(map(str, outstr)) + '\n')
Example No. 9
def filter_reads(input_r1, input_r2, output_r1, output_r2, reference):
    reader = fastqutils.PairedTaggedFastqReader(input_r1, input_r2)

    with helpers.getFileHandle(output_r1,
                               'wt') as writer_r1, helpers.getFileHandle(
                                   output_r2, 'wt') as writer_r2:
        for read_1, read_2 in reader.filter_read_iterator(reference):

            read_1 = reader.add_tag_to_read_comment(read_1)
            read_2 = reader.add_tag_to_read_comment(read_2)

            for line in read_1:
                writer_r1.write(line)

            for line in read_2:
                writer_r2.write(line)
Example No. 10
    def read_metrics_csv(self, cndata):
        """
        read the input file
        """

        samples = cndata.index

        data = {}
        numread_data = {}
        reads_per_bin_data = {}

        sepdata = defaultdict(list)
        colordata = {}

        header, dtypes, columns = csvutils.get_metadata(self.metrics)
        idxs = self.build_label_indices(columns)

        color_col = self.color_by_col
        sep_col = self.plot_by_col

        with helpers.getFileHandle(self.metrics) as freader:

            if header:
                assert freader.readline().strip().split(',') == columns

            for line in freader:
                line = line.strip().split(self.sep)

                sample_id = line[idxs['cell_id']]

                # skip samples that are just na or inf
                if sample_id not in samples:
                    continue

                val = line[idxs["mad_neutral_state"]]

                val = float('nan') if val == "NA" else float(val)

                ec = 'all' if sep_col == 'all' else line[idxs[sep_col]]

                cc = line[idxs[color_col]]

                numreads = float(line[idxs['total_mapped_reads_hmmcopy']])

                reads_per_bin = line[idxs['median_hmmcopy_reads_per_bin']]

                reads_per_bin = 0 if reads_per_bin == "NA" else float(
                    reads_per_bin)

                if self.cellcalls and cc not in self.cellcalls:
                    continue

                numread_data[sample_id] = numreads
                data[sample_id] = val
                reads_per_bin_data[sample_id] = reads_per_bin

                colordata[sample_id] = cc
                sepdata[ec].append(sample_id)

        return data, sepdata, colordata, numread_data, reads_per_bin_data
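
build_label_indices is not shown in these listings, but given how idxs is used above (idxs['cell_id'] indexing a split row), it presumably mirrors the inline header dict built in Example No. 4. A sketch under that assumption:

    def build_label_indices(self, columns):
        # assumed to match the inline header dict in Example No. 4:
        # column name -> positional index
        return {v: i for i, v in enumerate(columns)}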
Example No. 11
    def generate_metadata(self):
        with helpers.getFileHandle(self.filepath) as inputfile:
            header = inputfile.readline().strip()
            sep = self.__detect_sep_from_header(header)
            columns = header.split(sep)
            header = True
            dtypes = self.__generate_dtypes(sep=sep)
            return header, sep, dtypes, columns
Example No. 12
def re_tag_reads(infile, outfile):
    reader = fastqutils.TaggedFastqReader(infile)

    with helpers.getFileHandle(outfile, 'wt') as writer:

        for read in reader.get_read_iterator():
            read = reader.add_tag_to_read_comment(read)

            for line in read:
                writer.write(line)
Example No. 13
    def __write_yaml(self):

        yamldata = {'header': self.header, 'sep': self.sep, 'columns': []}

        for column in self.columns:
            data = {'name': column, 'dtype': self.dtypes[column]}
            yamldata['columns'].append(data)

        with helpers.getFileHandle(self.yaml_file, 'wt') as f:
            yaml.safe_dump(yamldata, f, default_flow_style=False)
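
The metadata file written here records the header flag, separator, and per-column dtypes alongside the csv. With hypothetical column names, the emitted YAML would look something like:

columns:
- dtype: str
  name: cell_id
- dtype: int
  name: total_mapped_reads
header: true
sep: ','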
Example No. 14
    def read_segs_csv(self):
        """
        read the input file
        """
        data = {}

        bins = {}

        header, dtypes, columns = csvutils.get_metadata(self.input)

        with helpers.getFileHandle(self.input, 'rt') as freader:
            idxs = self.build_label_indices(columns)

            if header:
                assert freader.readline().strip().split(',') == columns

            for line in freader:
                line = line.strip().split(self.sep)

                sample_id = line[idxs['cell_id']]

                val = line[idxs[self.column_name]]

                val = float('nan') if val == "NA" else float(val)

                chrom = line[idxs['chr']]
                start = int(line[idxs['start']])
                end = int(line[idxs['end']])

                seg = (chrom, start, end)

                if self.mappability_threshold and float(
                        line[idxs["map"]]) <= self.mappability_threshold:
                    val = float("nan")

                if chrom not in bins:
                    bins[chrom] = set()
                bins[chrom].add((start, end))

                # just a sanity check, not required
                if sample_id in data and seg in data[sample_id]:
                    raise Exception("repeated val")

                if sample_id not in data:
                    data[sample_id] = {}

                data[sample_id][seg] = val

            samples = sorted(data.keys())
            bins = self.sort_bins_csv(bins)

            data = self.conv_to_matrix(data, bins, samples)
            data = self.get_pandas_dataframe(data, bins)

        return data
Example No. 15
def merge_fastqs(inputs, output):
    read_counter = 0
    with helpers.getFileHandle(output, 'wt') as merged:
        for cellid in inputs:
            infile = inputs[cellid]
            reader = fastqutils.FastqReader(infile)
            for read in reader.get_read_iterator():
                read[0] = '@{}/{}'.format(
                    read_counter, read[0].split('/')[1])
                for line in read:
                    merged.write(line)
                read_counter += 1
Example No. 16
def annotate_ref_alt(haps_csv, refdir, output_csv):
    thousand_genomes = os.path.join(refdir, 'thousand_genomes_snps.tsv')

    annotation_data = {}

    with helpers.getFileHandle(thousand_genomes, 'rt') as db:
        for line in db:
            line = line.strip().split('\t')

            chrom, pos, ref, alt = line

            annotation_data[(chrom, pos)] = (ref, alt)

    with helpers.getFileHandle(haps_csv,
                               'rt') as reader, helpers.getFileHandle(
                                   output_csv, 'wt') as writer:

        header = reader.readline().strip()
        header += '\tref\talt\n'
        writer.write(header)

        for line in reader:
            line = line.strip()
            l_split = line.split('\t')

            chrom = l_split[0]
            pos = l_split[1]

            if (chrom, pos) in annotation_data:
                ref, alt = annotation_data[(chrom, pos)]
            else:
                ref = 'NA'
                alt = 'NA'

            line += '\t{}\t{}\n'.format(ref, alt)

            writer.write(line)
Example No. 17
def re_index_reads(input_fastq,
                   output_fastq,
                   cell_id,
                   cells,
                   cell_read_counts,
                   tag=False):
    start_count = get_start_count(cells, cell_read_counts, cell_id)

    with helpers.getFileHandle(input_fastq) as infile:
        with helpers.getFileHandle(output_fastq, 'wt') as outfile:

            while True:
                fastq_read = list(islice(infile, 4))

                if not fastq_read:
                    break

                assert len(fastq_read) == 4, 'fastq file format error'

                if not fastq_read[0].startswith('@'):
                    raise ValueError(
                        'Expected @ as first character of read name')

                if not fastq_read[2].startswith('+'):
                    raise ValueError(
                        'Expected + as first character of read comment')

                fastq_read[0] = '@{}/{}'.format(
                    start_count, fastq_read[0].split('/')[1])

                start_count += 1

                if tag:
                    fastq_read[2] = '+' + cell_id + "\n"

                for fastq_line in fastq_read:
                    outfile.write(fastq_line)
Example No. 18
def filter_tag_reads(input_r1, input_r2, output_r1, output_r2, params):
    genomes = [v['name'] for v in params['genomes']]

    if not params['filter_tags']:
        filter_tags = set()
    else:
        filter_tags = set(params['filter_tags'])

    reader = fastqutils.PairedTaggedFastqReader(input_r1, input_r2)

    with helpers.getFileHandle(output_r1,
                               'wt') as writer_r1, helpers.getFileHandle(
                                   output_r2, 'wt') as writer_r2:
        for read_1, read_2 in reader.filter_read_iterator(
                genomes, filter_tags):

            read_1 = reader.add_tag_to_read_comment(read_1)
            read_2 = reader.add_tag_to_read_comment(read_2)

            for line in read_1:
                writer_r1.write(line)

            for line in read_2:
                writer_r2.write(line)
Example No. 19
def get_read_count(input_fastq):
    def blocks(files, size=65536):
        while True:
            b = files.read(size)
            if not b:
                break
            yield b

    with helpers.getFileHandle(input_fastq) as indata:
        linecount = sum(bl.count("\n") for bl in blocks(indata))

    assert linecount % 4 == 0

    return linecount // 4
Example No. 20
    def __parse_metadata(self):
        with helpers.getFileHandle(self.filepath + '.yaml') as yamlfile:
            yamldata = yaml.safe_load(yamlfile)

        header = yamldata['header']
        sep = yamldata.get('sep', ',')

        dtypes = {}
        columns = []
        for coldata in yamldata['columns']:
            colname = coldata['name']

            dtypes[colname] = coldata['dtype']

            columns.append(colname)

        return header, sep, dtypes, columns
Example No. 21
    def get_read_iterator(self):
        with helpers.getFileHandle(self.file_path) as fq_reader:
            while True:
                fastq_read = list(islice(fq_reader, 4))

                if not fastq_read:
                    break

                assert len(fastq_read) == 4, 'fastq file format error'

                if not fastq_read[0].startswith('@'):
                    raise ValueError('Expected @ as first character of read name')

                if not fastq_read[2].startswith('+'):
                    raise ValueError('Expected + as first character of read comment')

                yield fastq_read
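
Each yielded read is the raw four-line fastq record with newlines intact. A hypothetical usage, constructing the reader the same way as merge_fastqs in Example No. 15 (the file name here is made up):

reader = fastqutils.FastqReader('sample.fastq.gz')
for read in reader.get_read_iterator():
    name, seq, comment, qual = read  # each line keeps its trailing newline
    print(name.strip())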
Example No. 22
    def check_empty_file(self, path):
        """checks if file is empty
        :param path: path to the file
        :returns bool: True if the file is empty, False otherwise
        """

        if not os.path.exists(path):
            raise IOError("Input file %s missing" % path)

        if os.stat(path).st_size == 0:
            return True

        with helpers.getFileHandle(path) as infile:
            # header line
            _ = infile.readline()
            # data?
            data = infile.readline()
            if not data:
                return True

        return False
Example No. 23
def write_detailed_counts(counts, outfile, cell_id):
    header = None

    with helpers.getFileHandle(outfile, 'wt') as writer:

        for read_end, read_end_counts in counts.items():

            if not read_end_counts:
                continue

            if not header:
                outstr = ['cell_id', 'readend']
                outstr += [v[0] for v in list(read_end_counts.keys())[0]]
                outstr += ['count']
                writer.write(','.join(outstr) + '\n')
                header = True

            for flags, count in read_end_counts.items():
                outstr = [cell_id, read_end]
                outstr += [v[1] for v in flags]
                outstr += [count]
                writer.write(','.join(map(str, outstr)) + '\n')
Example No. 24
def write_summary_counts(counts, outfile, cell_id):
    summary_counts = defaultdict(int)
    for read_end, read_end_counts in counts.items():
        for flags, count in read_end_counts.items():
            hit_orgs = [v[0] for v in flags if v[1] > 0]

            for org in hit_orgs:
                summary_counts[org] += count

            if len(hit_orgs) > 1:
                for org in hit_orgs:
                    summary_counts['{}_multihit'.format(org)] += count
            elif len(hit_orgs) == 0:
                summary_counts['nohit'] += count

    with helpers.getFileHandle(outfile, 'wt') as writer:
        keys = sorted(summary_counts.keys())
        header = ['cell_id'] + ['fastqscreen_{}'.format(key) for key in keys]
        header = ','.join(header) + '\n'
        writer.write(header)

        values = [cell_id] + [summary_counts[v] for v in keys]
        values = ','.join(map(str, values)) + '\n'
        writer.write(values)
Example No. 25
    def __detect_sep_from_file(self):
        with helpers.getFileHandle(self.filepath) as reader:
            header = reader.readline().strip()
            return self.__detect_sep_from_header(header)
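
Both this method and generate_metadata in Example No. 11 delegate to __detect_sep_from_header, which never appears in these listings. A minimal sketch, assuming the writers on this page only ever emit comma- or tab-separated files:

    def __detect_sep_from_header(self, header):
        # assumed logic: a tab anywhere in the header line means tsv,
        # otherwise default to comma
        if '\t' in header:
            return '\t'
        return ','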