Code Example #1
def add_corrupt_tree_order(corrupt_tree, metrics, output):
    """
    adds corrupt tree order to metrics
    """

    with open(corrupt_tree) as newickfile:
        newickdata = newickfile.readline()
        assert newickfile.readline() == ''

    tree = Tree(newickdata, format=1)

    leaves = [node.name for node in tree.traverse("levelorder")]
    leaves = [val[len('cell_'):] for val in leaves if val.startswith("cell_")]

    ordering = {val: i for i, val in enumerate(leaves)}

    metrics = csvutils.read_csv_and_yaml(metrics)

    cells = metrics.cell_id

    for cellid in cells:
        order = ordering.get(cellid, float('nan'))
        metrics.loc[metrics["cell_id"] == cellid, "order_corrupt_tree"] = order

    csvutils.write_dataframe_to_csv_and_yaml(metrics,
                                             output,
                                             write_header=True)
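
The ordering above is driven entirely by ete3's level-order traversal, which is easiest to see on a tiny tree. A minimal sketch, assuming ete3 is installed and using a hypothetical newick string in place of the corrupt-tree file:

from ete3 import Tree

# hypothetical three-cell tree; format=1 matches the call above
newick = "((cell_A:1,cell_B:1)n1:1,cell_C:2)root;"
tree = Tree(newick, format=1)

# level-order traversal visits the root and internal nodes too,
# so keep only the names with the cell_ prefix
names = [node.name for node in tree.traverse("levelorder")]
cells = [name[len("cell_"):] for name in names if name.startswith("cell_")]

ordering = {cell: i for i, cell in enumerate(cells)}
print(ordering)  # {'C': 0, 'A': 1, 'B': 2} for this tree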
Code Example #2
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output
    ]

    pypeliner.commandline.execute(*cmd)

    cell_cycle_df = pd.read_csv(temp_output)

    cols_cell_cycle = cell_cycle_df.columns.values

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df,
                                          on=['cell_id'],
                                          how='outer')

    out_dtypes = dtypes(genome_labels)
    for colname in cols_cell_cycle:
        hmm_metrics_df[colname] = hmm_metrics_df[colname].astype(
            out_dtypes[colname])

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output,
                                             out_dtypes)
Code Example #3
File: tasks.py  Project: DouglasAbrams/biowrappers
def get_tri_nucelotide_context(ref_genome_fasta_file, vcf_file, out_file,
                               table_name):
    vcf_reader = vcf.Reader(filename=vcf_file)

    fasta_reader = pysam.Fastafile(ref_genome_fasta_file)

    data = []

    for record in vcf_reader:
        chrom = record.CHROM

        coord = record.POS

        tri_nucleotide_context = fasta_reader.fetch(chrom, coord - 2,
                                                    coord + 1)

        data.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'tri_nucleotide_context': tri_nucleotide_context
        })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             out_file,
                                             data.dtypes.to_dict(),
                                             write_header=True)
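
Several of these examples pass data.dtypes.to_dict() instead of an explicit schema, so the yaml metadata simply records whatever dtypes pandas inferred. A minimal sketch of that mapping on a hypothetical frame:

import pandas as pd

data = pd.DataFrame({'chrom': ['1'], 'coord': [12345], 'tri_nucleotide_context': ['ACG']})
print(data.dtypes.to_dict())
# {'chrom': dtype('O'), 'coord': dtype('int64'), 'tri_nucleotide_context': dtype('O')}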
Code Example #4
def cell_cycle_classifier(hmmcopy_reads,
                          hmmcopy_metrics,
                          alignment_metrics,
                          output,
                          tempdir,
                          docker_image=None):
    helpers.makedirs(tempdir)
    temp_output = os.path.join(tempdir, 'cell_cycle_output.csv')

    cmd = [
        'cell_cycle_classifier', 'train-classify', hmmcopy_reads,
        hmmcopy_metrics, alignment_metrics, temp_output
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    cell_cycle_df = pd.read_csv(temp_output)

    hmm_metrics_df = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    hmm_metrics_df = hmm_metrics_df.merge(cell_cycle_df,
                                          on=['cell_id'],
                                          how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(hmm_metrics_df, output)
Code Example #5
def add_contamination_status(infile,
                             outfile,
                             config,
                             reference='grch37',
                             threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)
    organisms = [genome['name'] for genome in config['genomes']]

    if reference not in organisms:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in organisms if not col == reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']

    data['is_contaminated'] = data['is_contaminated'].astype(col_type)
    csvutils.write_dataframe_to_csv_and_yaml(data, outfile,
                                             dtypes()['metrics'])
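
The contamination flag above is just a per-genome read-fraction threshold. A minimal sketch of that thresholding with plain pandas, using hypothetical count columns and a direct column lookup in place of _get_col_data:

import pandas as pd

data = pd.DataFrame({
    'cell_id': ['c1', 'c2'],
    'grch37': [990, 600],
    'mm10': [10, 400],
    'total_reads': [1000, 1000],
})

threshold = 0.05
data['is_contaminated'] = False
for altcol in ['mm10']:  # every configured genome except the reference
    perc_alt = data[altcol] / data['total_reads']
    data.loc[perc_alt > threshold, 'is_contaminated'] = True

print(data['is_contaminated'].tolist())  # [False, True]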
Code Example #6
def convert_hdf_to_csv(h5_input, outputs):

    with pd.HDFStore(h5_input) as h5_data:
        for tablename, outfile in outputs.items():
            df = h5_data[tablename]
            csvutils.write_dataframe_to_csv_and_yaml(df,
                                                     outfile,
                                                     write_header=True)
Code Example #7
File: tasks.py  Project: DouglasAbrams/biowrappers
def annotate_db_status(db_vcf_file, target_vcf_file, out_file):
    db_reader = vcf.Reader(filename=db_vcf_file)

    reader = vcf.Reader(filename=target_vcf_file)

    data = []

    for record in reader:
        chrom = record.CHROM

        coord = record.POS

        try:
            db_position_records = [
                x for x in db_reader.fetch(chrom, coord - 1, coord)
            ]

        except ValueError:
            db_position_records = []

        for db_record in db_position_records:

            if (db_record.CHROM != chrom) or (db_record.POS != coord):
                continue

            if db_record.is_indel:
                indel = 1

            else:
                indel = 0

            for alt in record.ALT:

                if (record.REF == db_record.REF) and (alt in db_record.ALT):
                    exact_match = 1

                else:
                    exact_match = 0

                out_row = {
                    'chrom': chrom,
                    'coord': coord,
                    'ref': record.REF,
                    'alt': str(alt),
                    'db_id': db_record.ID,
                    'exact_match': exact_match,
                    'indel': indel
                }

                data.append(out_row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             out_file,
                                             data.dtypes.to_dict(),
                                             write_header=True)
Code Example #8
def convert_vcf_to_table(in_file, out_file):
    data = []

    parser = ClassicSnpEffParser(in_file)

    for row in parser:
        data.append(row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())
Code Example #9
    def write(self, input_df, transpose=False):
        '''
        write the dataframe to output file
        '''
        if transpose:
            del input_df["gc"]
            input_df = input_df.T
            input_df["cell_id"] = input_df.index

        input_df.columns = input_df.columns.astype(str)
        csvutils.write_dataframe_to_csv_and_yaml(input_df, self.output,
                                                 self.dtypes)
Code Example #10
def get_mappability_col(reads, annotated_reads):
    reads = csvutils.read_csv_and_yaml(reads, chunksize=100)

    alldata = []
    for read_data in reads:
        read_data['is_low_mappability'] = (read_data['map'] <= 0.9)
        alldata.append(read_data)

    alldata = pd.concat(alldata)

    csvutils.write_dataframe_to_csv_and_yaml(
        alldata, annotated_reads, dtypes()['reads'], write_header=True
    )
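
read_csv_and_yaml is called with chunksize=100 here, so the reads table is annotated piece by piece instead of being loaded whole. The same pattern with plain pandas, assuming a hypothetical reads.csv.gz that has a 'map' column:

import pandas as pd

chunks = []
for chunk in pd.read_csv('reads.csv.gz', chunksize=100):
    # flag low-mappability bins one chunk at a time
    chunk['is_low_mappability'] = chunk['map'] <= 0.9
    chunks.append(chunk)

annotated = pd.concat(chunks, ignore_index=True)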
Code Example #11
    def write_dfs(self, tmpdir, dfs, dtypes, write_heads=True):
        n_dfs = len(dfs)
        names = [
            os.path.join(tmpdir,
                         str(i) + ".csv.gz") for i in range(n_dfs)
        ]

        assert len({n_dfs, len(dtypes)}) == 1

        for i in range(n_dfs):
            csvutils.write_dataframe_to_csv_and_yaml(dfs[i], names[i],
                                                     dtypes[i], write_heads)
        return names
Code Example #12
def get_mappability(mappability_file,
                    vcf_file,
                    out_file,
                    region=None,
                    append_chr=True):
    map_reader = BigWigFile(open(mappability_file, 'rb'))

    vcf_reader = vcf.Reader(filename=vcf_file)

    if region is not None:
        chrom, beg, end = parse_region_for_vcf(region)
        try:
            vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)
        except ValueError:
            print("no data for region {} in vcf".format(region))
            vcf_reader = []

    data = []

    for record in vcf_reader:
        if append_chr:
            chrom = 'chr{0}'.format(record.CHROM)

        else:
            chrom = record.CHROM

        coord = record.POS

        beg = coord - 100

        beg = max(beg, 0)

        end = coord + 100

        result = map_reader.query(chrom, beg, end, 1)

        if result is None:
            mappability = 0

        else:
            mappability = result[0]['mean']

        data.append({
            'chrom': record.CHROM,
            'coord': record.POS,
            'mappability': mappability
        })

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes())
Code Example #13
def merge_fastq_screen_counts(all_detailed_counts, all_summary_counts,
                              merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    if len(detailed_data) > 0:
        df = pd.concat(detailed_data)
    else:
        df = pd.DataFrame(
            columns=["cell_id", "readend", "human", "mouse", "count"])
    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_detailed_counts,
                                             write_header=True,
                                             dtypes=dtypes())

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    if len(summary_counts) > 0:
        df = pd.concat(summary_counts)
    else:
        df = pd.DataFrame(columns=["cell_id", "fastqscreen_nohit"])

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_summary_counts,
                                             write_header=True,
                                             dtypes=dtypes())
Code Example #14
    def test_write_to_csv_yaml_empty(self, tmpdir):
        """
        write empty df
        """

        dtypes = {v: "int" for v in 'ABCD'}
        df = pd.DataFrame()
        filename = os.path.join(tmpdir, "df.csv.gz")
        yaml_filename = filename + ".yaml"

        csvutils.write_dataframe_to_csv_and_yaml(df, filename, dtypes)

        assert os.path.exists(filename)
        assert os.path.exists(yaml_filename)
Code Example #15
def genotype(input_bam,
             reference,
             input_vcf,
             output_vcf,
             output_csv,
             tempdir,
             cell_id,
             docker_image=None):
    """
    calls svtyper-sso on input
    bam and vcf to perform genotyping.
    :param input_bam:
    :type input_bam:
    :param reference:
    :type reference:
    :param input_vcf:
    :type input_vcf:
    :param output_vcf:
    :type output_vcf:
    :param output_csv:
    :type output_csv:
    :param tempdir:
    :type tempdir:
    :param docker_image:
    :type docker_image:
    :return:
    :rtype:
    """
    helpers.makedirs(tempdir)

    cmd = [
        'svtyper-sso', '--input_vcf', input_vcf, '--bam', input_bam,
        '--ref_fasta', reference, '-o', output_vcf
    ]

    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    base_data = parse_vcf(output_vcf, None, return_pandas=True)

    svtype_annotations = extract_svtyper_info(base_data)

    base_data = base_data.iloc[:, :-2]  # assumes svtyper info in last 2 cols

    output = pd.concat([base_data, svtype_annotations], axis=1)

    output['cell_id'] = cell_id

    csvutils.write_dataframe_to_csv_and_yaml(output,
                                             output_csv,
                                             write_header=True)
Code Example #16
def annotate_metrics(metrics, output, sample_info, cells):
    """
    adds sample information to metrics in place
    """

    metrics = csvutils.read_csv_and_yaml(metrics)

    for cellid in cells:
        cellinfo = sample_info[cellid]

        for colname, value in cellinfo.items():
            metrics.loc[metrics["cell_id"] == cellid, colname] = value

    csvutils.write_dataframe_to_csv_and_yaml(metrics, output)
Code Example #17
def test_contamination(tmpdir):
    data = {}

    cols = [
        'fastqscreen_nohit',
        'fastqscreen_grch37',
        'fastqscreen_grch37_multihit',
        'fastqscreen_mm10',
        'fastqscreen_mm10_multihit',
        'fastqscreen_salmon',
        'fastqscreen_salmon_multihit'
    ]

    for i in range(5):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = i * 10
        data[i]['fastqscreen_grch37'] = i * 1000
        data[i]['fastqscreen_mm10'] = i * 100

    for i in range(5, 10):
        data[i] = {'cell_id': 'SA123_A123_R{0}_C{0}'.format(i)}
        for col in cols:
            data[i][col] = (i * 10)
        data[i]['fastqscreen_grch37'] = i * 1000

    data = pd.DataFrame.from_dict(data, orient='index')
    data['total_reads'] = data[cols].sum(axis=1)

    dtypes = {col: 'int' for col in cols}
    dtypes['cell_id'] = 'str'
    dtypes['total_reads'] = 'int'

    infile = os.path.join(tmpdir, 'input.csv.gz')
    outfile = os.path.join(tmpdir, 'output.csv.gz')

    csvutils.write_dataframe_to_csv_and_yaml(data, infile, dtypes)

    config = {'genomes': [{'name': 'grch37'}, {'name': 'mm10'}, {'name': 'salmon'}]}

    tasks.add_contamination_status(infile, outfile, config)

    output = csvutils.read_csv_and_yaml(outfile)

    assert output['is_contaminated'].tolist() == [False] + [True] * 4 + [False] * 5
Code Example #18
def merge_fastq_screen_counts(all_detailed_counts, all_summary_counts,
                              merged_detailed_counts, merged_summary_counts,
                              fastqscreen_config):
    genome_labels = [
        genome['name'] for genome in fastqscreen_config['genomes']
    ]

    all_detailed_counts = helpers.flatten(all_detailed_counts)
    all_detailed_counts = [
        pd.read_csv(file) for file in all_detailed_counts
        if not helpers.is_empty(file)
    ]
    df = pd.concat(all_detailed_counts)

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(
        df,
        merged_detailed_counts,
        fastqscreen_dtypes(genome_labels)['fastqscreen_detailed'],
        write_header=True)

    all_summary_counts = helpers.flatten(all_summary_counts)
    all_summary_counts = [
        pd.read_csv(file) for file in all_summary_counts
        if not helpers.is_empty(file)
    ]
    df = pd.concat(all_summary_counts)

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(
        df,
        merged_summary_counts,
        fastqscreen_dtypes(genome_labels)['metrics'],
        write_header=True)
Code Example #19
def merge_fastq_screen_counts(all_detailed_counts, all_summary_counts,
                              merged_detailed_counts, merged_summary_counts):
    if isinstance(all_detailed_counts, dict):
        all_detailed_counts = all_detailed_counts.values()

    detailed_data = []
    for countsfile in all_detailed_counts:
        if os.stat(countsfile).st_size == 0:
            continue
        detailed_data.append(pd.read_csv(countsfile))

    df = pd.concat(detailed_data)

    index_cols = [v for v in df.columns.values if v != "count"]

    df['count'] = df.groupby(index_cols)['count'].transform('sum')

    df = df.drop_duplicates(subset=index_cols)

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_detailed_counts,
                                             dtypes()['fastqscreen_detailed'],
                                             write_header=True)

    if isinstance(all_summary_counts, dict):
        all_summary_counts = all_summary_counts.values()

    summary_counts = [
        pd.read_csv(countsfile) for countsfile in all_summary_counts
    ]

    df = pd.concat(summary_counts)

    update_cols = [v for v in df.columns.values if v != 'cell_id']

    for colname in update_cols:
        df[colname] = df.groupby('cell_id')[colname].transform('sum')

    df = df.drop_duplicates(subset=['cell_id'])

    csvutils.write_dataframe_to_csv_and_yaml(df,
                                             merged_summary_counts,
                                             dtypes()['metrics'],
                                             write_header=True)
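
Both merge_fastq_screen_counts variants rely on the same groupby / transform('sum') / drop_duplicates idiom to collapse repeated rows into one row per key. A minimal sketch of that idiom on a small hypothetical counts table:

import pandas as pd

df = pd.DataFrame({
    'cell_id': ['c1', 'c1', 'c2'],
    'readend': ['R1', 'R1', 'R1'],
    'count':   [5, 7, 3],
})

index_cols = [v for v in df.columns.values if v != 'count']

# overwrite each row's count with its group total, then keep one row per group
df['count'] = df.groupby(index_cols)['count'].transform('sum')
df = df.drop_duplicates(subset=index_cols)

print(df.to_dict('records'))
# [{'cell_id': 'c1', 'readend': 'R1', 'count': 12},
#  {'cell_id': 'c2', 'readend': 'R1', 'count': 3}]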
Code Example #20
    def base_write_to_csv_yaml_test(self, temp, dtypes, length, write_header=True):
        """
        base test for write csv yaml
        """
        df = self.make_test_dfs([dtypes], length)

        csv = self.write_dfs(temp, df, [dtypes], write_header)

        filename = csv[0]

        yaml_filename = filename + ".yaml"
        os.remove(yaml_filename)

        assert not os.path.exists(yaml_filename)

        csvutils.write_dataframe_to_csv_and_yaml(df[0], filename, dtypes,
                                                 write_header=write_header)

        return df[0], filename, yaml_filename
Code Example #21
def add_contamination_status(infile,
                             outfile,
                             reference='grch37',
                             ref_threshold=0.6,
                             alt_threshold=0.2,
                             strict_validation=True):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    fastqscreen_cols = [
        col for col in data.columns.values if col.startswith('fastqscreen_')
    ]

    reference = "fastqscreen_{}".format(reference)
    if reference not in fastqscreen_cols:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in fastqscreen_cols if not col == reference]

    data['is_contaminated'] = False

    perc_ref = data[reference] / data['total_reads']
    data.loc[perc_ref <= ref_threshold, 'is_contaminated'] = True

    for altcol in alts:
        perc_alt = data[altcol] / data['total_reads']
        data.loc[perc_alt > alt_threshold, 'is_contaminated'] = True

    col_type = dtypes()['metrics']['is_contaminated']
    data['is_contaminated'] = data['is_contaminated'].astype(col_type)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             outfile,
                                             write_header=True,
                                             dtypes=dtypes()['metrics'])

    # get cells that are contaminated and have enough human reads
    check_df = data.loc[data['is_contaminated'] == True].copy()
    check_df['perc_ref'] = data[reference] / data['total_reads']
    check_df = check_df[check_df['perc_ref'] > ref_threshold]
    if strict_validation and (len(check_df) / len(data) > 0.2):
        logging.error("over 20% of cells are contaminated")
Code Example #22
File: tasks.py  Project: DouglasAbrams/biowrappers
def convert_vcf_to_table(in_file, out_file, table_name, classic_mode=True):
    data = []

    if classic_mode:
        parser = biowrappers.components.variant_calling.snpeff.parser.ClassicSnpEffParser(
            in_file)

    else:
        parser = biowrappers.components.variant_calling.snpeff.parser.SnpEffParser(
            in_file)

    for row in parser:
        data.append(row)

    data = pd.DataFrame(data)

    csvutils.write_dataframe_to_csv_and_yaml(data,
                                             out_file,
                                             data.dtypes.to_dict(),
                                             write_header=True)
Code Example #23
    def test_concat_csv_with_nans(self, tmpdir, n_rows):
        """
        concat two csvs with NaNs
        """
        dtypes = {v: "float" for v in 'ABCD'}

        concatenated = os.path.join(tmpdir, 'concat.csv.gz')

        dfs = self.make_test_dfs([dtypes, dtypes], n_rows)
        csvs = [os.path.join(tmpdir, "0.csv.gz"),
                os.path.join(tmpdir, "1.csv.gz")]

        dfs[0].iloc[2, dfs[0].columns.get_loc("A")] = np.NaN
        dfs[1].iloc[2, dfs[1].columns.get_loc("A")] = np.NaN

        csvutils.write_dataframe_to_csv_and_yaml(dfs[0], csvs[0], dtypes)
        csvutils.write_dataframe_to_csv_and_yaml(dfs[1], csvs[1], dtypes)

        ref = pd.concat(dfs, ignore_index=True)
        csvutils.concatenate_csv(csvs, concatenated)

        assert self.dfs_exact_match(ref, concatenated)
Code Example #24
def classify_fastqscreen(training_data_path, metrics_path, metrics_output, dtypes):
    df = csvutils.read_csv_and_yaml(metrics_path)
    features_train, feature_transformer, model = train(training_data_path)

    features = ["fastqscreen_nohit_ratio", "fastqscreen_grch37_ratio", "fastqscreen_mm10_ratio",
                "fastqscreen_salmon_ratio"]
    label_to_species = {0: "grch37", 1: "mm10", 2: "salmon"}
    # check that all the feature columns exist; if so, make predictions (otherwise the species column is left out)
    exist = all([feature[:-6] in df for feature in features])
    if exist:
        # make the feature columns
        for feature in features:
            df[feature] = df[feature[:-6]].divide(df["total_reads"])
        # check if there's any missing value
        feature_test = df[features]
        feature_test = feature_test.replace([np.inf, -np.inf], np.nan)
        feature_test.fillna(features_train.mean(), inplace=True)
        # scale the features
        scaled_features = feature_transformer.transform(feature_test)
        df["species"] = model.predict(scaled_features)
        df["species"].replace(label_to_species, inplace=True)
    csvutils.write_dataframe_to_csv_and_yaml(df, metrics_output, dtypes)
Code Example #25
def add_contamination_status(infile,
                             outfile,
                             genome_labels,
                             reference='grch37',
                             threshold=0.05):
    data = csvutils.read_csv_and_yaml(infile)

    data = data.set_index('cell_id', drop=False)

    if reference not in genome_labels:
        raise Exception("Could not find the fastq screen counts")

    alts = [col for col in genome_labels if not col == reference]

    data['is_contaminated'] = False

    for altcol in alts:
        perc_alt = _get_col_data(data, altcol) / data['total_reads']
        data.loc[perc_alt > threshold, 'is_contaminated'] = True

    data['is_contaminated'] = data['is_contaminated'].astype('bool')
    csvutils.write_dataframe_to_csv_and_yaml(data, outfile,
                                             dtypes(genome_labels))
Code Example #26
def get_snv_allele_counts_for_vcf_targets(bam_file,
                                          vcf_file,
                                          out_file,
                                          count_duplicates=False,
                                          min_bqual=0,
                                          min_mqual=0,
                                          region=None,
                                          vcf_to_bam_chrom_map=None,
                                          report_zero_count_positions=False,
                                          dtypes=None,
                                          **extra_columns):

    bam = pysam.AlignmentFile(bam_file, 'rb')

    vcf_reader = vcf.Reader(filename=vcf_file)

    if region is not None:
        chrom, beg, end = utils.parse_region_for_vcf(region)

        try:
            vcf_reader = vcf_reader.fetch(chrom, start=beg, end=end)

        except ValueError:
            vcf_reader = ()

    data = []

    for record in vcf_reader:
        if vcf_to_bam_chrom_map is not None:
            bam_chrom = vcf_to_bam_chrom_map[record.CHROM]

        else:
            bam_chrom = record.CHROM

        df = _get_counts_df(
            bam,
            bam_chrom,
            record.POS,
            record.POS + 1,
            count_duplicates=count_duplicates,
            min_bqual=min_bqual,
            min_mqual=min_mqual,
            strand='both',
            report_zero_count_positions=report_zero_count_positions,
        )

        if df is None:
            continue

        counts = df.iloc[0]

        ref_base = record.REF

        # Skip record with reference base == N
        if ref_base not in nucleotides:
            continue

        for alt_base in record.ALT:
            alt_base = str(alt_base)

            if (len(ref_base) != 1) or (len(alt_base) != 1):
                continue

            # Skip record with alt base == N
            if alt_base not in nucleotides:
                continue

            if not report_zero_count_positions and counts[
                    ref_base] == 0 and counts[alt_base] == 0:
                continue

            # Format output record
            out_row = {
                'chrom': record.CHROM,
                'coord': record.POS,
                'ref': ref_base,
                'alt': alt_base,
                'ref_counts': counts[ref_base],
                'alt_counts': counts[alt_base]
            }

            data.append(out_row)

    data = pd.DataFrame(
        data,
        columns=['chrom', 'coord', 'ref', 'alt', 'ref_counts', 'alt_counts'])

    for col, value in extra_columns.items():
        data[col] = value

    csvutils.write_dataframe_to_csv_and_yaml(data, out_file, dtypes)