def setup_clinvar_file():
    """
    Return an open file handle on the latest b37 ClinVar VCF file.

    The file is located (via clinvar_update.get_latest_vcf_file) in the
    'genome_processing_files' directory under settings.LOCAL_STORAGE_ROOT;
    that directory is created first if it does not exist.

    Files ending in '.bz2' or '.gz' are opened through the matching
    decompressor in 'rb' mode; any other file is opened with the built-in
    open() in its default mode. The caller is responsible for closing the
    returned handle.
    """
    storage_dir = os.path.join(settings.LOCAL_STORAGE_ROOT,
                               'genome_processing_files')
    if not os.path.exists(storage_dir):
        os.makedirs(storage_dir)

    vcf_path = clinvar_update.get_latest_vcf_file(target_dir=storage_dir,
                                                  build='b37')

    # Choose the opener from the filename suffix.
    if vcf_path.endswith('.bz2'):
        return bz2.BZ2File(vcf_path, 'rb')
    if vcf_path.endswith('.gz'):
        return gzip.open(vcf_path, 'rb')
    return open(vcf_path)
Exemple #2
0
def setup_clinvar_file():
    """
    Open the most recent b37 ClinVar VCF file and return the handle.

    The download/lookup directory is 'genome_processing_files' under
    settings.LOCAL_STORAGE_ROOT and is created when missing. The path
    itself comes from clinvar_update.get_latest_vcf_file.

    '.bz2' and '.gz' files are opened with bz2/gzip in 'rb' mode; anything
    else falls back to a plain open() with its default mode. Closing the
    returned handle is left to the caller.
    """
    target = os.path.join(settings.LOCAL_STORAGE_ROOT,
                          'genome_processing_files')
    if not os.path.exists(target):
        os.makedirs(target)
    path = clinvar_update.get_latest_vcf_file(target_dir=target, build='b37')

    # Suffix -> opener, checked in the same order as the suffix tests.
    compressed_openers = (
        ('.bz2', lambda name: bz2.BZ2File(name, 'rb')),
        ('.gz', lambda name: gzip.open(name, 'rb')),
    )
    for suffix, opener in compressed_openers:
        if path.endswith(suffix):
            return opener(path)
    # No known compression suffix: treat as an uncompressed file.
    return open(path)
Exemple #3
0
def setup_clinvar_data():
    """
    Return a set built from the ClinVar "sigposlist" data for build b37.

    The latest ClinVar VCF path is obtained from
    clinvar_update.get_latest_vcf_file, using the 'genome_processing_files'
    directory under settings.LOCAL_STORAGE_ROOT (created if missing).

    If a gzipped JSON file named '<clinvar path>.sigposlist.json.gz' already
    exists, its contents are loaded; otherwise generate_clinvar_sig is called
    with the ClinVar path and that JSON path (presumably it also writes the
    JSON file for later runs -- confirm against generate_clinvar_sig).

    Returns:
        set: the loaded or generated items.
    """
    local_storage = os.path.join(settings.LOCAL_STORAGE_ROOT,
                                 'genome_processing_files')
    if not os.path.exists(local_storage):
        os.makedirs(local_storage)
    clinvar_filepath = clinvar_update.get_latest_vcf_file(
        target_dir=local_storage, build='b37')
    clinvar_sig_filepath = '{}.sigposlist.json.gz'.format(clinvar_filepath)
    if os.path.exists(clinvar_sig_filepath):
        # Use a context manager so the gzip handle is closed once the JSON
        # has been read, instead of being left open until garbage collection.
        with gzip.open(clinvar_sig_filepath, 'rt') as clinvar_sig_file:
            clinvar_sig = json.load(clinvar_sig_file)
    else:
        clinvar_sig = generate_clinvar_sig(clinvar_filepath,
                                           clinvar_sig_filepath)
    return set(clinvar_sig)
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    The genome is matched against the latest b37 ClinVar VCF (fetched into
    FILESDIR by clinvar_update). Matches that fail the VCF filter or that have
    no ClinVar record with significance '4' or '5' are dropped. The remaining
    variants are looked up on myvariant.info and written one per CSV row.

    Parameters:
        inputfile: genome VCF data, passed as-is to
            vcf2clinvar.match_to_clinvar.
        outputfile: path of the CSV file to write.
        inputfilename: value written in the first column of every row.

    Raises:
        IOError: if the ClinVar filename does not end with '.vcf',
            '.vcf.gz', or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # The matches may be produced lazily while the ClinVar file is read, so
    # the file is only closed once the loop below has consumed them all.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(
            inputfile, input_clinvar_file)
        # Set up to get myvariant.info data (mainly for ExAC data.)
        mv = myvariant.MyVariantInfo()

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters and
                    'PASS' not in genome_vcf_line.filters):
                continue
            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue
            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(
                genome_vcf_line.chrom,
                genome_vcf_line.start,
                genome_vcf_line.ref_allele,
                allele.sequence)
            data[poskey] = {'genome_vcf_line': genome_vcf_line,
                            'clinvar_allele': allele,
                            'zygosity': zygosity}
    finally:
        input_clinvar_file.close()

    # Add data from myvariant.info using the HGVS positions.
    # A real list is needed here: on Python 3 data.keys() is a view that
    # cannot be indexed, which made the original index-based loop fail.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    # Assumes getvariants returns one result per queried id, in query order
    # (the same assumption the index-based pairing relied on).
    for variant, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[variant]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[variant]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            record = data[var]
            vcf_line = record['genome_vcf_line']
            allele = record['clinvar_allele']
            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                allele.records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in record:
                mv_clinvar = record['mv_clinvar']
                cv_url = ('http://www.ncbi.nlm.nih.gov/clinvar/variation/'
                          '{}/'.format(mv_clinvar['variant_id']))
                try:
                    # Single-record case: 'rcv' can be indexed by key.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records: merge the distinct
                    # condition names (sorted so the report is
                    # deterministic) and use the first preferred name.
                    disease_name = ', '.join(sorted(
                        {rcv['conditions']['name']
                         for rcv in mv_clinvar['rcv']}))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)
            # chrom[3:] drops the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- confirm for your VCFs).
            exac_url = ('http://exac.broadinstitute.org/variant/'
                        '{}-{}-{}-{}'.format(
                            vcf_line.chrom[3:],
                            vcf_line.start,
                            vcf_line.ref_allele,
                            allele.sequence))
            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in record:
                mv_exac = record['mv_exac']
                # "* 1.0" forces float division on Python 2 as well.
                total_freq = str(
                    mv_exac['ac']['ac'] * 1.0 / mv_exac['an']['an'])
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(allele.frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'
            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url]
            csv_out.writerow(data_row)
Exemple #5
0
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    Steps: fetch the latest b37 ClinVar VCF into FILESDIR, match the genome
    against it with vcf2clinvar, keep only matches that pass the VCF filter
    and have a ClinVar record with significance '4' or '5', enrich them with
    myvariant.info data, and write one CSV row per kept variant.

    Parameters:
        inputfile: genome VCF data, handed unchanged to
            vcf2clinvar.match_to_clinvar.
        outputfile: path of the CSV report to create.
        inputfilename: label placed in the first column of each row.

    Raises:
        IOError: if the ClinVar filename does not end with '.vcf',
            '.vcf.gz', or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # Close the ClinVar handle only after the match loop has finished: the
    # matches may be generated lazily from the open file.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile,
                                                       input_clinvar_file)
        # Set up to get myvariant.info data (mainly for ExAC data.)
        mv = myvariant.MyVariantInfo()

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if (genome_vcf_line.filters
                    and 'PASS' not in genome_vcf_line.filters):
                continue
            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue
            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                           genome_vcf_line.start,
                                           genome_vcf_line.ref_allele,
                                           allele.sequence)
            data[poskey] = {
                'genome_vcf_line': genome_vcf_line,
                'clinvar_allele': allele,
                'zygosity': zygosity
            }
    finally:
        input_clinvar_file.close()

    # Add data from myvariant.info using the HGVS positions.
    # Materialise the keys: a Python 3 dict view cannot be indexed, so the
    # original "variants[i]" lookups raised TypeError there.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    # Assumes one result per queried id, in query order (as the original
    # positional pairing did).
    for variant, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[variant]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[variant]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]
            vcf_line = entry['genome_vcf_line']
            allele = entry['clinvar_allele']
            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                allele.records[0].acc)
            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = ('http://www.ncbi.nlm.nih.gov/clinvar/variation/'
                          '{}/'.format(mv_clinvar['variant_id']))
                try:
                    # Works when 'rcv' is a single mapping.
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records: join the distinct
                    # condition names (sorted for a deterministic report)
                    # and take the first record's preferred name.
                    disease_name = ', '.join(
                        sorted({
                            rcv['conditions']['name']
                            for rcv in mv_clinvar['rcv']
                        }))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)
            # chrom[3:] removes the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- confirm for your VCFs).
            exac_url = ('http://exac.broadinstitute.org/variant/'
                        '{}-{}-{}-{}'.format(vcf_line.chrom[3:],
                                             vcf_line.start,
                                             vcf_line.ref_allele,
                                             allele.sequence))
            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                mv_exac = entry['mv_exac']
                # "* 1.0" keeps this a float division on Python 2 too.
                total_freq = str(
                    mv_exac['ac']['ac'] * 1.0 / mv_exac['an']['an'])
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(allele.frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'
            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url
            ]
            csv_out.writerow(data_row)
def main():
    """
    Parse command line argument and
    output appropriate file type (csv or JSON)

    The genome VCF comes from -i/--input or, failing that, from stdin.
    Exactly one of -c (ClinVar file) or -C (ClinVar directory) must be
    given, together with a genome build of 'b37' or 'b38'.

    Raises:
        IOError: for an unsupported genome/ClinVar filename suffix, an
            invalid or missing genome build, or an invalid report type.
        SystemExit: (exit status 1) when no genome input is available or
            the ClinVar file/directory options are missing or both given.
    """
    parser = ArgumentParser()

    parser.add_argument(
        "-c", "--clinvarfile", dest="clinvarfile",
        help="ClinVar VCF file (either this or -C must be specified)",
        metavar="CLINVARFILE")
    parser.add_argument(
        "-C", "--clinvardir", dest="clinvardir",
        help="ClinVar VCF directory (either this or -c must be specified). " +
        "This option will use vcf2clinvar.clinvar_update to automatically " +
        "check and import the most recent ClinVar file to this directory.",
        metavar="CLINVARDIR")
    parser.add_argument(
        "-i", "--input", dest="inputfile",
        help="Input VCF file ['.vcf', '.vcf.gz', '.vcf.bz2']. " +
        "Uncompressed genome data is also accepted via stdin.",
        metavar="INPUT")
    parser.add_argument(
        "-t", "--type", dest="type", default='csv',
        help="Output report type ('csv' or 'json'). Defaults to csv. " +
        "CSV Report: Reports all genome variants matching ClinVar records, " +
        "and some summary ClinVar data from these records. Header lines " +
        "with metadata begin with '##'.\n" +
        "JSON Report: Reports genome variants matching ClinVar records " +
        "(no record information is included).",
        metavar="TYPE")
    parser.add_argument(
        "-n", "--notes", dest="notes",
        help="Notes (JSON format) to include in report. (JSON report only)",
        metavar="NOTES")
    parser.add_argument(
        "-g", "--genome-build", dest="build",
        help="Genome build to include in report ('b37' or 'b38').",
        metavar="GENOMEBUILD")
    options = parser.parse_args()

    # Hard-coded version string passed through to the reports.
    version = "0.1.2a"

    # Open the genome input, choosing the opener from the filename suffix.
    if options.inputfile:
        if options.inputfile.endswith('.vcf'):
            input_genome_file = open(options.inputfile)
        elif options.inputfile.endswith('.vcf.gz'):
            input_genome_file = gzip.open(options.inputfile)
        elif options.inputfile.endswith('.vcf.bz2'):
            input_genome_file = bz2.BZ2File(options.inputfile)
        else:
            raise IOError("Genome filename expected to end with '.vcf'," +
                          " '.vcf.gz', or '.vcf.bz2'.")
    elif not sys.stdin.isatty():
        # Nothing given with -i but stdin is not a terminal: read from it.
        input_genome_file = sys.stdin
    else:
        sys.stderr.write("Provide input VCF file\n")
        parser.print_help()
        sys.exit(1)

    if options.build in ('b37', 'b38'):
        build = options.build
    else:
        raise IOError("Input VCF genome build must be 'b37' or 'b38'.")

    # Exactly one of the ClinVar file / directory options is required.
    if (not (options.clinvarfile or options.clinvardir) or
            (options.clinvarfile and options.clinvardir)):
        sys.stderr.write("Please provide either a ClinVar file or directory.\n")
        parser.print_help()
        sys.exit(1)
    if options.clinvarfile:
        clinvarfilename = options.clinvarfile
    else:
        clinvarfilename = get_latest_vcf_file(target_dir=options.clinvardir,
                                              build=build)
    if clinvarfilename.endswith('.vcf'):
        # Open the resolved filename: options.clinvarfile is None when the
        # file was located through -C/--clinvardir.
        input_clinvar_file = open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvarfilename)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    if options.type not in ['csv', 'json']:
        raise IOError("Not a valid report type, must be 'csv' or 'json'.")
    if options.type == "csv":
        csv_report(input_genome_file=input_genome_file,
                   input_clinvar_file=input_clinvar_file,
                   build=build,
                   version=version)
    elif options.type == "json":
        notes_json = {}
        if options.notes:
            # Fallback if the notes are not valid JSON: keep the raw string
            # under "parameter". A successful parse replaces this dict.
            notes_json["parameter"] = options.notes
            try:
                notes_json = json.loads(options.notes)
            except ValueError:
                # json.loads signals malformed input with ValueError (its
                # JSONDecodeError subclass on Python 3).
                sys.stderr.write("Could not parse JSON notes field\n")
        json_report(input_genome_file=input_genome_file,
                    input_clinvar_file=input_clinvar_file,
                    build=build,
                    notes=notes_json,
                    version=version)
def main():
    """
    Parse command line argument and
    output appropriate file type (csv or JSON)

    Genome data is read from the -i/--input file or, when that is absent,
    from stdin. A genome build ('b37' or 'b38') and exactly one of
    -c (ClinVar file) / -C (ClinVar directory) are required.

    Raises:
        IOError: for an unsupported genome/ClinVar filename suffix, an
            invalid or missing genome build, or an invalid report type.
        SystemExit: (exit status 1) when no genome input is available or
            the ClinVar file/directory options are missing or both given.
    """
    parser = ArgumentParser()

    parser.add_argument(
        "-c",
        "--clinvarfile",
        dest="clinvarfile",
        help="ClinVar VCF file (either this or -C must be specified)",
        metavar="CLINVARFILE")
    parser.add_argument(
        "-C",
        "--clinvardir",
        dest="clinvardir",
        help="ClinVar VCF directory (either this or -c must be specified). " +
        "This option will use vcf2clinvar.clinvar_update to automatically " +
        "check and import the most recent ClinVar file to this directory.",
        metavar="CLINVARDIR")
    parser.add_argument(
        "-i",
        "--input",
        dest="inputfile",
        help="Input VCF file ['.vcf', '.vcf.gz', '.vcf.bz2']. " +
        "Uncompressed genome data is also accepted via stdin.",
        metavar="INPUT")
    parser.add_argument(
        "-t",
        "--type",
        dest="type",
        default='csv',
        help="Output report type ('csv' or 'json'). Defaults to csv. " +
        "CSV Report: Reports all genome variants matching ClinVar records, " +
        "and some summary ClinVar data from these records. Header lines " +
        "with metadata begin with '##'.\n" +
        "JSON Report: Reports genome variants matching ClinVar records " +
        "(no record information is included).",
        metavar="TYPE")
    parser.add_argument(
        "-n",
        "--notes",
        dest="notes",
        help="Notes (JSON format) to include in report. (JSON report only)",
        metavar="NOTES")
    parser.add_argument(
        "-g",
        "--genome-build",
        dest="build",
        help="Genome build to include in report ('b37' or 'b38').",
        metavar="GENOMEBUILD")
    options = parser.parse_args()

    # NOTE(review): the version is whatever "python setup.py --version"
    # prints in the current working directory; it is empty when that
    # command produces no output (e.g. no setup.py there).
    version = os.popen("python setup.py --version").read().strip()

    # Open the genome input, choosing the opener from the filename suffix.
    if options.inputfile:
        if options.inputfile.endswith('.vcf'):
            input_genome_file = open(options.inputfile)
        elif options.inputfile.endswith('.vcf.gz'):
            input_genome_file = gzip.open(options.inputfile)
        elif options.inputfile.endswith('.vcf.bz2'):
            input_genome_file = bz2.BZ2File(options.inputfile)
        else:
            raise IOError("Genome filename expected to end with '.vcf'," +
                          " '.vcf.gz', or '.vcf.bz2'.")
    elif not sys.stdin.isatty():
        # No -i given but stdin is not a terminal: read the genome from it.
        input_genome_file = sys.stdin
    else:
        sys.stderr.write("Provide input VCF file\n")
        parser.print_help()
        sys.exit(1)

    if options.build in ('b37', 'b38'):
        build = options.build
    else:
        raise IOError("Input VCF genome build must be 'b37' or 'b38'.")

    # Exactly one of the ClinVar file / directory options is required.
    if (not (options.clinvarfile or options.clinvardir)
            or (options.clinvarfile and options.clinvardir)):
        sys.stderr.write("Please provide either a ClinVar file or directory.\n")
        parser.print_help()
        sys.exit(1)
    if options.clinvarfile:
        clinvarfilename = options.clinvarfile
    else:
        clinvarfilename = get_latest_vcf_file(target_dir=options.clinvardir,
                                              build=build)
    if clinvarfilename.endswith('.vcf'):
        # Use the resolved filename: options.clinvarfile is None when the
        # ClinVar file was found through -C/--clinvardir.
        input_clinvar_file = open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvarfilename)
    elif clinvarfilename.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvarfilename)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    if options.type not in ['csv', 'json']:
        raise IOError("Not a valid report type, must be 'csv' or 'json'.")
    if options.type == "csv":
        csv_report(input_genome_file=input_genome_file,
                   input_clinvar_file=input_clinvar_file,
                   build=build,
                   version=version)
    elif options.type == "json":
        notes_json = {}
        if options.notes:
            # If the notes are not valid JSON, the raw string is kept under
            # "parameter"; a successful parse replaces this dict entirely.
            notes_json["parameter"] = options.notes
            try:
                notes_json = json.loads(options.notes)
            except ValueError:
                # Malformed JSON raises ValueError (JSONDecodeError is a
                # subclass on Python 3); other errors are not swallowed.
                sys.stderr.write("Could not parse JSON notes field\n")
        json_report(input_genome_file=input_genome_file,
                    input_clinvar_file=input_clinvar_file,
                    build=build,
                    notes=notes_json,
                    version=version)