Example #1
0
def get_sample_ids(vcf_path,
                   provided_t_name=None,
                   provided_n_name=None,
                   provided_r_name=None,
                   return_names=False):
    """Resolve tumor / normal / RNA samples of a VCF to names or column indices.

    Sample names are taken from the ``provided_*_name`` arguments when given;
    otherwise, for VCFs with more than one sample, they are filled in from
    ``guess_sample_names(vcf_path)`` and, for the normal, from whatever
    samples are left over after the tumor and RNA ones are excluded.

    Args:
        vcf_path: Path of the VCF; passed to ``cyvcf2.VCF`` to read the
            sample list.
        provided_t_name: Optional comma-separated tumor sample name(s).
        provided_n_name: Optional comma-separated normal sample name(s).
        provided_r_name: Optional comma-separated RNA sample name(s).
        return_names: If true, return sample names; otherwise return the
            0-based positions of the samples in the VCF sample list.

    Returns:
        A tuple ``(tumor, normal)``, extended to ``(tumor, normal, rna)`` when
        RNA sample names were provided. Each element is a single value when
        exactly one sample was resolved for that role, and a list otherwise
        (an empty list when none was resolved, e.g. the normal of a
        single-sample VCF).

    Raises:
        AssertionError: If a provided name, or any resolved name, is not
            among the VCF samples.
    """
    from cyvcf2 import VCF
    vcf_samples = VCF(vcf_path).samples

    t_names = _split_provided_names(provided_t_name, 'Tumor', vcf_samples, vcf_path)
    n_names = _split_provided_names(provided_n_name, 'Normal', vcf_samples, vcf_path)
    r_names = _split_provided_names(provided_r_name, 'RNA', vcf_samples, vcf_path)

    if len(vcf_samples) == 1:
        # A single-sample VCF is always treated as tumor-only.
        t_names = [vcf_samples[0]]
    else:
        guessed_t_name, guessed_n_name = guess_sample_names(vcf_path)
        if not t_names:
            if not guessed_t_name:
                # NOTE(review): critical() is defined elsewhere; presumably
                # it logs and aborts -- confirm.
                critical(
                    f'Can\'t guess tumor sample name from the VCF {vcf_path}')
            t_names = [guessed_t_name]
        if not n_names:
            if guessed_n_name:
                n_names = [guessed_n_name]
            else:
                # t_names is always non-empty here, so fall back to every
                # sample that is neither tumor nor RNA.
                n_names = [
                    s for s in vcf_samples
                    if s not in t_names and s not in r_names
                ]
                if not n_names:
                    critical(
                        f'Can\'t guess normal sample name from the VCF {vcf_path}'
                    )

    # Indices are resolved (and every name re-checked) even when only names
    # are returned, so an inconsistent guess fails in both modes.
    t_ids = _sample_indices(t_names, 't_names', vcf_samples)
    n_ids = _sample_indices(n_names, 'n_names', vcf_samples)
    r_ids = _sample_indices(r_names, 'r_names', vcf_samples)

    if return_names:
        tumor, normal, rna = t_names, n_names, r_names
    else:
        tumor, normal, rna = t_ids, n_ids, r_ids

    ret = (_single_or_list(tumor), _single_or_list(normal))
    if r_names:
        ret += (_single_or_list(rna), )
    return ret


def _split_provided_names(provided, label, vcf_samples, vcf_path):
    """Split a comma-separated name string, asserting each name is in the VCF."""
    names = []
    if provided:
        for sname in provided.split(','):
            assert sname in vcf_samples,\
                f'{label} sample name {sname} is not in VCF {vcf_path}. Found: {vcf_samples}'
            names.append(sname)
    return names


def _sample_indices(names, label, vcf_samples):
    """Return the position of every name in vcf_samples, asserting all are present."""
    # Subset, not mere overlap: .index() below is called for every name.
    assert set(names) <= set(vcf_samples), \
        f'{label}: {names}, vcf_samples: {vcf_samples}'
    return [vcf_samples.index(name) for name in names]


def _single_or_list(values):
    """Unwrap a one-element list to its element; return any other list as is."""
    return values[0] if len(values) == 1 else values