コード例 #1
0
def LoadReaders(vcffiles):
    """
    Validate the given VCF paths and open one reader per file.

    Every path must end in ".vcf.gz", exist on disk, and have a ".tbi"
    index file next to it. Any violation is reported via common.ERROR.

    Input:
    - vcffiles (list of str): paths of the VCF files to open

    Output:
    - list of vcf.Reader objects, in the same order as vcffiles
    """
    if len(vcffiles) < 1:
        common.ERROR("No VCF files found")
    for path in vcffiles:
        has_bgzip_name = path.endswith(".vcf.gz")
        if not has_bgzip_name:
            common.ERROR("Make sure %s is bgzipped and indexed"%path)
        if not os.path.isfile(path):
            common.ERROR("Could not find VCF file %s"%path)
        index_path = path+".tbi"
        if not os.path.isfile(index_path):
            common.ERROR("Could not find VCF index %s.tbi"%path)
    # The handles are deliberately left open: each reader keeps reading
    # from its own file object after this function returns.
    readers = []
    for path in vcffiles:
        handle = open(path, "rb")
        readers.append(vcf.Reader(handle))
    return readers
コード例 #2
0
def WriteMergedHeader(vcfw, args, readers, cmd):
    """
    Write the merged VCF header and check that merging is appropriate.

    Input:
    - vcfw (file-like): open handle the header lines are written to
    - args (argparse namespace): reads args.merge_ggl and
      args.update_sample_from_file
    - readers (list of vcf.Reader): readers of the VCFs being merged
    - cmd (str): command line of this run, recorded as a ##command line

    All readers must report identical contigs; otherwise common.ERROR
    is called. INFO/FORMAT definitions are copied from the first reader.
    """
    # Check contigs the same for all readers
    contigs = readers[0].contigs
    for reader in readers[1:]:
        if reader.contigs != contigs:
            common.ERROR("Different contigs found across VCF files. Make sure all files used the same reference")
    # Write VCF format, commands, and contigs
    vcfw.write("##fileformat=VCFv4.1\n")
    for reader in readers:
        vcfw.write("##command="+reader.metadata["command"][0]+"\n")
    vcfw.write("##command="+cmd+"\n")
    for contig in contigs.values():
        vcfw.write("##contig=<ID=%s,length=%s>\n"%(contig.id, contig.length))
    # Write GangSTR specific INFO fields.
    # The list previously named STUTTERP twice and never STUTTERUP, which
    # duplicated one header line and dropped another.
    info_fields = ["END", "PERIOD", "RU", "REF",
                   "STUTTERUP", "STUTTERDOWN", "STUTTERP", "EXPTHRESH"]
    for field in info_fields:
        vcfw.write(GetInfoString(readers[0].infos[field])+"\n")
    if args.merge_ggl:
        vcfw.write(GetInfoString(readers[0].infos["GRID"])+"\n")
    # Write GangSTR specific FORMAT fields
    format_fields = ["GT", "DP", "Q", "REPCN", "REPCI", "RC", "ML", "INS", "STDERR", "QEXP"]
    for field in format_fields:
        vcfw.write(GetFormatString(readers[0].formats[field])+"\n")
    if args.merge_ggl:
        vcfw.write(GetFormatString(readers[0].formats["GGL"])+"\n")
    # Write sample list
    samples = GetSamples(readers, usefilenames=args.update_sample_from_file)
    header_fields = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    vcfw.write("#"+"\t".join(header_fields+samples)+"\n")
コード例 #3
0
ファイル: filters.py プロジェクト: shubhamsaini/STRTools
 def LoadRegions(self, filename):
     """
     Load a regions file into self.regions as a BedTool.

     Calls common.ERROR when filename does not exist. When the loaded
     BedTool reports it is not tabix-indexed, a notice is written to
     stderr and tabix(force=True) is invoked on it.
     """
     path_exists = os.path.exists(filename)
     if not path_exists:
         common.ERROR("%s not found" % filename)
     bed = BedTool(filename)
     self.regions = bed
     if bed._tabixed():
         return
     sys.stderr.write("Creating tabix index for %s\n" % filename)
     # NOTE(review): the value returned by tabix() is discarded, so
     # self.regions keeps referring to the BedTool built above -- confirm
     # that is what later lookups expect.
     bed.tabix(force=True)
コード例 #4
0
def GetSamples(readers, usefilenames=False):
    """
    Collect the sample names of all readers into one list.

    Input:
    - readers (list): objects exposing .samples (list of str) and, when
      usefilenames is set, .filename (str)
    - usefilenames (bool): prefix every sample with "<file name>:" where
      the file name has a trailing ".vcf.gz" removed

    Output:
    - samples (list of str), in reader order

    Calls common.ERROR if the resulting names are not unique.
    """
    suffix = ".vcf.gz"
    samples = []
    for r in readers:
        if usefilenames:
            prefix = r.filename
            # str.strip() removes a *set of characters* from both ends, so
            # it mangles names such as "gcv.vcf.gz"; cut the suffix instead.
            if prefix.endswith(suffix):
                prefix = prefix[:-len(suffix)]
            samples.extend(prefix + ":" + s for s in r.samples)
        else:
            samples.extend(r.samples)
    if len(set(samples)) != len(samples):
        common.ERROR("Duplicate samples found. Quitting")
    return samples
コード例 #5
0
def ParseFam(args):
    """
    Parse fam file and extract affected and unaffected sample IDs.

    Input:
    - args (namespace from parser.parse_args): reads args.fam,
      args.affec_min_call_count and args.unaff_min_call_count

    Output (tuple):
    - isAffected ({str: bool}): sample ID (column 2) -> True when the
      phenotype (column 6) is '2', False for any other value
    - min_affec (int): args.affec_min_call_count, or the number of
      affected lines when that argument is -1
    - min_unaff (int): args.unaff_min_call_count, or the number of
      unaffected lines when that argument is -1

    Calls common.ERROR when a line has fewer than 6 columns or when a
    requested minimum exceeds the number of matching lines in the file.
    """
    filename = args.fam
    min_affec = args.affec_min_call_count
    min_unaff = args.unaff_min_call_count
    isAffected = {}
    count_affec = 0
    count_unaff = 0
    with open(filename, 'r') as f:
        for lineno, line in enumerate(f, 1):
            recs = line.strip().split('\t')
            if len(recs) < 6:
                # fam files are also commonly space-delimited; fall back to
                # splitting on any whitespace before rejecting the line.
                recs = line.split()
            if len(recs) < 6:
                common.ERROR("Insufficient number of columns in line " +
                             str(lineno) + " of fam file: " + filename)
            sid = recs[1]
            phe = recs[5]
            if phe == '2':
                isAffected[sid] = True
                count_affec += 1
            else:
                isAffected[sid] = False
                count_unaff += 1
    # -1 means "not specified": default to every sample of that status.
    if min_affec == -1:
        min_affec = count_affec
    elif count_affec < min_affec:
        common.ERROR("Minimum number of affected calls (" + str(min_affec) + \
                     ") larger than number of affected samples in fam file (" + str(count_affec) + ")")
    if min_unaff == -1:
        min_unaff = count_unaff
    elif count_unaff < min_unaff:
        common.ERROR("Minimum number of unaffected calls (" + str(min_unaff) + \
                     ") larger than number of unaffected samples in fam file (" + str(count_unaff) + ")")
    return isAffected, min_affec, min_unaff
コード例 #6
0
def LoadCondition(vcffile, condition, sample_order):
    """
    Load the genotypes of the SNP to condition on.

    Input:
    - vcffile (str): path of the VCF file to search
    - condition (str): position given as "chrom:start"
    - sample_order (list): passed through to LoadGT

    Output:
    - whatever LoadGT(record, sample_order, is_str=False) returns for the
      first fetched record whose .start equals the requested start

    Calls common.ERROR when no such record is found.
    """
    chrom, start = condition.split(":")
    target = int(start)
    region = "%s:%s-%s" % (chrom, start, target + 1)
    with open(vcffile, "rb") as handle:
        reader2 = vcf.Reader(handle)
        reader2.fetch(region)
        for record in reader2:
            # Diagnostics go to stderr: stdout may be the results stream
            # (the caller supports --out stdout).
            sys.stderr.write("%s %s %s\n" % (record.start, target, record.ID))
            if record.start == target:
                return LoadGT(record, sample_order, is_str=False)
    common.ERROR("Could not find SNP to condition on")
コード例 #7
0
def GetRefAllele(current_records, mergelist):
    """
    Return the upper-cased reference allele shared by the records being merged.

    Only records whose mergelist entry is truthy are considered. If they
    do not agree on exactly one reference allele, common.ERROR is called
    with the CHROM/POS of the last considered record.
    """
    chosen = [current_records[idx]
              for idx, take in enumerate(mergelist) if take]
    refs = [rec.REF.upper() for rec in chosen]
    if chosen:
        chrom, pos = chosen[-1].CHROM, chosen[-1].POS
    else:
        chrom, pos = "", -1
    distinct = set(refs)
    if len(distinct) != 1:
        common.ERROR("Conflicting refs found at %s:%s"%(chrom, pos))
    return refs[0]
コード例 #8
0
def GetInfoItem(current_records, mergelist, info_field, fail=True):
    """
    Get info item. Make sure it's the same across merged records.

    Input:
    - current_records (list): records; only those whose mergelist entry
      is truthy are inspected
    - mergelist (list of bool): which records take part in the merge
    - info_field (str): key looked up in each record's INFO
    - fail (bool): if True, call common.ERROR when the values differ

    Output:
    - "<info_field>=<value>" when exactly one distinct value is found,
      otherwise None (after writing a warning line to stderr)
    """
    vals = set()
    for i, selected in enumerate(mergelist):
        if selected:
            vals.add(current_records[i].INFO[info_field])
    if len(vals) == 1:
        return "%s=%s"%(info_field, vals.pop())
    if fail:
        common.ERROR("More than one value found for %s"%info_field)
    # Terminate the warning with a newline so consecutive messages do not
    # run together on stderr.
    sys.stderr.write("WARNING more than one value found for %s\n"%info_field)
    return None
コード例 #9
0
def main():
    """
    Command-line entry point: merge several sorted, indexed VCF files.

    Parses the arguments, opens one reader per input VCF, writes the
    merged header to "<out>.vcf" and then walks all readers in parallel,
    merging the records that GetMinRecords flags at each step.
    """
    # ArgumentParser's first positional parameter is `prog`; the module
    # docstring belongs in `description`.
    parser = argparse.ArgumentParser(description=__doc__)
    ### Required arguments ###
    req_group = parser.add_argument_group("Required arguments")
    req_group.add_argument("--vcfs", help="Comma-separated list of VCF files to merge (must be sorted, bgzipped and indexed)", type=str, required=True)
    req_group.add_argument("--out", help="Prefix to name output files", type=str, required=True)
    ### Special merge options ###
    spec_group = parser.add_argument_group("Special merge options")
    spec_group.add_argument("--update-sample-from-file", help="Use file names, rather than sample header names, when merging", action="store_true")
    spec_group.add_argument("--merge-ggl", help="Merge GGL fields", action="store_true")
    ### Optional arguments ###
    opt_group = parser.add_argument_group("Optional arguments")
    opt_group.add_argument("--verbose", help="Print out extra info", action="store_true")
    opt_group.add_argument("--quiet", help="Don't print out anything", action="store_true")
    ### Parse args ###
    args = parser.parse_args()
    if args.merge_ggl: common.ERROR("--merge-ggl not implemented yet") # TODO remove

    ### Load readers ###
    vcfreaders = LoadReaders(args.vcfs.split(","))
    contigs = vcfreaders[0].contigs
    chroms = list(contigs)

    ### Set up VCF writer ###
    # The context manager guarantees the output is flushed and closed even
    # when merging stops early.
    with open(args.out + ".vcf", "w") as vcfw:
        WriteMergedHeader(vcfw, args, vcfreaders, " ".join(sys.argv))

        ### Walk through sorted readers, merging records as we go ###
        current_records = [next(reader) for reader in vcfreaders]
        is_min = GetMinRecords(current_records, chroms, debug=args.verbose)
        done = DoneReading(current_records)
        while not done:
            if args.verbose: PrintCurrentRecords(current_records, is_min)
            CheckMin(is_min)
            MergeRecords(vcfreaders, current_records, is_min, vcfw, args)
            current_records = GetNextRecords(vcfreaders, current_records, is_min)
            is_min = GetMinRecords(current_records, chroms)
            done = DoneReading(current_records)
コード例 #10
0
def CheckMin(is_min):
    """
    Guard against a stalled merge loop.

    is_min holds one flag per reader; if none of them is set,
    common.ERROR is called.
    """
    flagged = sum(is_min)
    if flagged != 0:
        return
    common.ERROR("Unexpected error. Stuck in infinite loop and exiting.")
コード例 #11
0
def CheckFilters(args):
    """
    Perform checks on user input for filters.

    Input:
    - args (argparse namespace): reads, for each group in (affec, unaff)
      and kind in (het, hom, total), the attributes
      <group>_min_expansion_prob_<kind> and
      <group>_max_expansion_prob_<kind>, plus affec_min_call_count and
      unaff_min_call_count

    Checks, calling common.ERROR on failure:
    - every probability that is not None lies within [0, 1]
    - when both are given, the min probability does not exceed the max
    - a call count other than -1 is not negative
    """
    for kind in ("het", "hom", "total"):
        for group in ("affec", "unaff"):
            min_flag = "--%s-min-expansion-prob-%s" % (group, kind)
            max_flag = "--%s-max-expansion-prob-%s" % (group, kind)
            min_val = getattr(args, "%s_min_expansion_prob_%s" % (group, kind))
            max_val = getattr(args, "%s_max_expansion_prob_%s" % (group, kind))
            if max_val is not None and (max_val < 0 or max_val > 1):
                common.ERROR("%s must be between 0 and 1" % max_flag)
            if min_val is not None and (min_val < 0 or min_val > 1):
                common.ERROR("%s must be between 0 and 1" % min_flag)
            # The hom/total variants used to test `min < max`, rejecting
            # valid ranges and accepting inverted ones; a range is only
            # invalid when min exceeds max.
            if min_val is not None and max_val is not None and min_val > max_val:
                common.ERROR("%s must be less than %s" % (min_flag, max_flag))
    call_counts = (("affected", args.affec_min_call_count),
                   ("unaffected", args.unaff_min_call_count))
    for label, count in call_counts:
        # -1 is the "not specified" value and is therefore allowed.
        if count != -1 and count < 0:
            common.ERROR("Minimum number of %s calls (%s) must be 0 or more" % (label, count))
コード例 #12
0
def main():
    """
    Command-line entry point for the association tests.

    Parses the arguments, loads the phenotype table (--fam or --pheno)
    and optional covariates, restricts the samples to those present in
    the VCF, then for every record read from --vcf loads its genotypes,
    runs PerformAssociation and writes the result with OutputAssoc.
    Optional per-allele and per-allele-length tests are run for records
    treated as STRs.
    """
    # NOTE(review): ArgumentParser's first positional parameter is `prog`,
    # so __doc__ is used as the program name here, not as the description
    # -- confirm this is intended.
    parser = argparse.ArgumentParser(__doc__)
    inout_group = parser.add_argument_group("Input/output")
    inout_group.add_argument("--vcf", help="Input VCF file", type=str)
    inout_group.add_argument("--out", help="Output prefix", type=str)
    inout_group.add_argument("--fam",
                             help="FAM file with phenotype info",
                             type=str)
    inout_group.add_argument("--samples",
                             help="File with list of samples to include",
                             type=str)
    inout_group.add_argument("--exclude-samples",
                             help="File with list of samples to exclude",
                             type=str)
    pheno_group = parser.add_argument_group("Phenotypes")
    pheno_group.add_argument("--pheno",
                             help="Phenotypes file (to use instead of --fam)",
                             type=str)
    pheno_group.add_argument("--mpheno",
                             help="Use (n+2)th column from --pheno",
                             type=int,
                             default=1)
    pheno_group.add_argument("--missing-phenotype",
                             help="Missing phenotype code",
                             type=str,
                             default="-9")
    covar_group = parser.add_argument_group("Covariates")
    covar_group.add_argument("--covar", help="Covariates file", type=str)
    covar_group.add_argument(
        "--covar-name",
        help="Names of covariates to load. Comma-separated",
        type=str)
    covar_group.add_argument(
        "--covar-number",
        help="Column number of covariates to load. Comma-separated",
        type=str)
    covar_group.add_argument("--sex",
                             help="Include sex from fam file as covariate",
                             action="store_true")
    covar_group.add_argument("--cohort-pgc",
                             help="Use cohort from PGC FIDs as a covariate",
                             action="store_true")
    assoc_group = parser.add_argument_group("Association testing")
    assoc_group.add_argument("--linear",
                             help="Perform linear regression",
                             action="store_true")
    assoc_group.add_argument("--logistic",
                             help="Perform logistic regression",
                             action="store_true")
    assoc_group.add_argument("--region",
                             help="Only process this region (chrom:start-end)",
                             type=str)
    assoc_group.add_argument("--infer-snpstr",
                             help="Infer which positions are SNPs vs. STRs",
                             action="store_true")
    assoc_group.add_argument(
        "--allele-tests",
        help="Also perform allele-based tests using each separate allele",
        action="store_true")
    assoc_group.add_argument(
        "--allele-tests-length",
        help="Also perform allele-based tests using allele length",
        action="store_true")
    assoc_group.add_argument("--minmaf",
                             help="Ignore bi-allelic sites with low MAF",
                             type=float,
                             default=0.01)
    assoc_group.add_argument("--str-only",
                             help="Used with --infer-snptr, only analyze STRs",
                             action="store_true")
    assoc_group.add_argument(
        "--remove-rare-str-alleles",
        help="Remove genotypes with alleles less than this freq",
        default=0.0,
        type=float)
    assoc_group.add_argument(
        "--max-iter",
        help="Maximum number of iterations for logistic regression",
        default=100,
        type=int)
    fm_group = parser.add_argument_group("Fine mapping")
    fm_group.add_argument("--condition",
                          help="Condition on this position chrom:start",
                          type=str)
    args = parser.parse_args()
    # Some initial checks
    # Exactly one of the two store_true flags must be set.
    # NOTE(review): bare ERROR is called here and in the --sex check below,
    # while the rest of this function uses common.ERROR -- confirm ERROR is
    # defined in this module.
    if int(args.linear) + int(args.logistic) != 1:
        ERROR("Must choose one of --linear or --logistic")

    # Load phenotype information
    common.MSG("Loading phenotype information...")
    if args.fam is not None:
        pdata = LoadPhenoData(args.fam,
                              fam=True,
                              missing=args.missing_phenotype,
                              sex=args.sex)
    elif args.pheno is not None:
        if args.sex: ERROR("--sex only works when using --fam (not --pheno)")
        pdata = LoadPhenoData(args.pheno,
                              fam=False,
                              missing=args.missing_phenotype,
                              mpheno=args.mpheno)
    else:
        common.ERROR("Must specify phenotype using either --fam or --pheno")
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Load covariate information
    common.MSG("Loading covariate information...")
    covarcols = []
    if args.covar is not None:
        pdata, covarcols = AddCovars(pdata, args.covar, args.covar_name,
                                     args.covar_number)
    if args.sex: covarcols.append("sex")
    if args.cohort_pgc:
        # The cohort label is the part of FID before the first "*".
        pdata["cohort"] = pdata["FID"].apply(lambda x: x.split("*")[0])
        covarcols.append("cohort")
    common.MSG("Loaded %s samples..." % pdata.shape[0])

    # Include/exclude samples
    common.MSG("Loading sample information...")
    if args.samples is not None:
        pdata = RestrictSamples(pdata, args.samples, include=True)
    if args.exclude_samples is not None:
        pdata = RestrictSamples(pdata, args.exclude_samples, include=False)
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Setup VCF reader
    common.MSG("Set up VCF reader")
    reader = vcf.Reader(open(args.vcf, "rb"))

    # Set sample ID to FID_IID to match vcf
    common.MSG("Set up sample info")
    pdata["sample"] = pdata.apply(lambda x: x["FID"] + "_" + x["IID"], 1)
    # Keep only the rows whose FID_IID name appears among the VCF samples.
    reader_samples = set(reader.samples)
    pdata = pdata[pdata["sample"].apply(lambda x: x in reader_samples)]
    sample_order = list(pdata["sample"])
    pdata = pdata[["phenotype", "sample"] + covarcols]
    common.MSG("Left with %s samples..." % pdata.shape[0])

    # Get data to condition on
    if args.condition is not None:
        # Element 0 of LoadCondition's result is stored as an extra
        # covariate column named "condition".
        cond_gt = LoadCondition(args.vcf, args.condition, sample_order)
        pdata["condition"] = cond_gt[0]
        covarcols.append("condition")

    # Prepare output file
    # The literal value "stdout" selects standard output instead of a file.
    if args.out == "stdout":
        outf = sys.stdout
    else:
        outf = open(args.out, "w")
    PrintHeader(outf,
                case_control=args.logistic,
                quant=args.linear,
                comment_lines=[" ".join(sys.argv)])

    # Perform association test for each record
    common.MSG("Perform associations... with covars %s" % str(covarcols))
    if args.region: reader = reader.fetch(args.region)
    for record in reader:
        # Check MAF
        # Use the smaller of the summed alternate-allele frequency and its
        # complement as the frequency compared against --minmaf.
        aaf = sum(record.aaf)
        aaf = min([aaf, 1 - aaf])
        if aaf < args.minmaf: continue
        # Infer whether we should treat as a SNP or STR
        is_str = True  # by default, assume all data is STRs
        if args.infer_snpstr:
            # A single-base REF with exactly one single-base ALT is treated
            # as a SNP.
            if len(record.REF) == 1 and len(record.ALT) == 1 and len(
                    record.ALT[0]) == 1:
                is_str = False
            if is_str and len(record.REF) < MIN_STR_LENGTH:
                continue  # probably an indel
            if not is_str and args.str_only: continue
        # Extract genotypes in sample order, perform regression, and output
        common.MSG("   Load genotypes...")
        gts, exclude_samples = LoadGT(record,
                                      sample_order,
                                      is_str=is_str,
                                      rmrare=args.remove_rare_str_alleles)
        pdata["GT"] = gts
        # STR records are passed minmaf=1 instead of the --minmaf value.
        # NOTE(review): confirm how PerformAssociation interprets minmaf=1.
        if is_str: minmaf = 1
        else: minmaf = args.minmaf
        common.MSG("   Perform association...")
        assoc = PerformAssociation(pdata,
                                   covarcols,
                                   case_control=args.logistic,
                                   quant=args.linear,
                                   minmaf=minmaf,
                                   exclude_samples=exclude_samples,
                                   maxiter=args.max_iter)
        common.MSG("   Output association...")
        OutputAssoc(record.CHROM,
                    record.POS,
                    assoc,
                    outf,
                    assoc_type=GetAssocType(is_str, name=record.ID))
        # Allele based tests
        common.MSG("   Allele based tests...")
        if is_str and args.allele_tests:
            # Index 0 is the reference allele, 1..len(ALT) the alternates.
            alleles = [record.REF] + record.ALT
            for i in range(len(record.ALT) + 1):
                gts, exclude_samples = LoadGT(record,
                                              sample_order,
                                              is_str=True,
                                              use_alt_num=i)
                pdata["GT"] = gts
                assoc = PerformAssociation(pdata,
                                           covarcols,
                                           case_control=args.logistic,
                                           quant=args.linear,
                                           exclude_samples=exclude_samples,
                                           maxiter=args.max_iter)
                OutputAssoc(record.CHROM,
                            record.POS,
                            assoc,
                            outf,
                            assoc_type=GetAssocType(is_str, alt=alleles[i]))
        if is_str and args.allele_tests_length:
            # One test per distinct allele length across REF and all ALTs.
            for length in set([len(record.REF)] +
                              [len(alt) for alt in record.ALT]):
                gts, exclude_samples = LoadGT(record,
                                              sample_order,
                                              is_str=True,
                                              use_alt_length=length)
                pdata["GT"] = gts
                assoc = PerformAssociation(pdata,
                                           covarcols,
                                           case_control=args.logistic,
                                           quant=args.linear,
                                           exclude_samples=exclude_samples,
                                           maxiter=args.max_iter)
                OutputAssoc(record.CHROM,
                            record.POS,
                            assoc,
                            outf,
                            assoc_type=GetAssocType(is_str, alt_len=length))