Ejemplo n.º 1
0
def get_ga4gh_subpops(baseURL):
    """Return a ``{individual name: individual description}`` mapping.

    Connects to the GA4GH server at *baseURL*, takes the first dataset the
    server lists and collects every individual found in that dataset.

    :param baseURL: base URL handed to ``client.HttpClient``.
    :return: dict keyed by ``individual.name`` with ``individual.description``
        as value.
    :raises IndexError: if the server lists no datasets.
    """
    ga4gh = client.HttpClient(baseURL)
    # Only the first dataset is considered; the listing is fully materialised
    # before indexing.
    first_dataset = list(ga4gh.search_datasets())[0]
    subpops = {}
    for person in ga4gh.search_individuals(first_dataset.id):
        subpops[person.name] = person.description
    return subpops
Ejemplo n.º 2
0
def runDemo():
    """Print the variants of a fixed region as tab-separated lines.

    Queries the GA4GH server on ``http://localhost:8000`` for the hard-coded
    variant set id, reference name "1", positions 45000 to 50000, and prints
    one line per variant: reference name, start, end, reference bases and
    alternate bases.
    """
    ga4gh = client.HttpClient("http://localhost:8000")
    hits = ga4gh.search_variants(
        "WyIxa2ctcDMtc3Vic2V0IiwidnMiLCJtdm5jYWxsIl0",
        reference_name="1", start=45000, end=50000)
    for hit in hits:
        columns = (
            hit.reference_name, hit.start, hit.end,
            hit.reference_bases, hit.alternate_bases)
        print(*columns, sep="\t")
Ejemplo n.º 3
0
 def setUp(self):
     """Build an ``HttpClient`` whose request helpers are mocked, plus the
     fixture values the tests pass to the client."""
     self.httpClient = client.HttpClient("http://example.com")
     # Replace every request helper with a Mock so no HTTP traffic happens.
     for request_helper in (
             "_run_search_request",
             "_run_get_request",
             "_run_list_request",
             "_run_get_request_path",
             "_run_post_request"):
         setattr(self.httpClient, request_helper, mock.Mock())

     # Page size is stored on the test case and pushed into the client.
     self.pageSize = 1000
     self.httpClient.set_page_size(self.pageSize)

     # Generic object identifiers.
     self.objectId = "SomeId"
     self.objectName = "objectName"
     self.parentId = "parentId"

     # Container / set identifiers.
     self.datasetId = "datasetId"
     self.variantSetId = "variantSetId"
     self.variantAnnotationSetId = "variantAnnotationSetId"
     self.featureSetId = "featureSetId"
     self.continuousSetId = "continuousSetId"
     self.referenceSetId = "referenceSetId"
     self.referenceId = "referenceId"
     self.readGroupIds = ["readGroupId"]
     self.callSetIds = ["id1", "id2"]

     # Biosample / individual fixtures.
     self.biosampleId = "biosampleId"
     self.biosampleName = "biosampleName"
     self.individualName = "individualName"
     self.individualId = "individualId"

     # Region and feature fixtures.
     self.feature = "feature"
     self.geneSymbol = "geneSymbol"
     self.referenceName = "referenceName"
     self.start = 100
     self.end = 101

     # Reference-set lookup fixtures.
     self.assemblyId = "assemblyId"
     self.accession = "accession"
     self.md5checksum = "md5checksum"

     # Genotype-to-phenotype fixtures.
     self.phenotype_association_set_id = "phenotype_association_set_id"
     self.feature_ids = ["id1", "id2"]
     self.phenotype_ids = ["id3", "id4"]
     self.evidence = protocol.EvidenceQuery()

     # RNA quantification fixtures.
     self.rnaQuantificationSetId = "rnaQuantificationSetId"
     self.rnaQuantificationId = "rnaQuantificationId"
     self.expressionLevelId = "expressionLevelId"
     self.threshold = 0.0
Ejemplo n.º 4
0
def build_variant_alias_dictionary(gene):
    """Map every alias of the BRCA variants around *gene* to its variant id.

    Searches the ``brca-hg37`` variant set on ``BRCA_GA4GH_URL`` for variants
    on ``gene.chromosome`` between ``gene.start - GENE_BUFFER`` and
    ``gene.end + GENE_BUFFER``. For each variant the following keys are
    registered, all pointing at ``variant.id``:

    * every entry of ``variant.names``
    * the first ``HGVS_cDNA`` info value (as ``str``)
    * the first ``HGVS_Protein`` info value (as ``str``)

    :param gene: object exposing ``chromosome``, ``start``, ``end`` and
        ``name`` attributes.
    :return: dict of alias -> variant id.

    NOTE: if a variant carries more than one ``HGVS_cDNA`` value the first
    value and its type are printed and the whole process exits via
    ``sys.exit()``. This looks like a leftover debugging guard and is kept
    as-is.
    """
    ga4gh_client = client.HttpClient(BRCA_GA4GH_URL)
    variant_aliases = {}
    variant_count = 0
    variants = ga4gh_client.search_variants(
        reference_name=gene.chromosome,
        variant_set_id="brca-hg37",
        start=gene.start - GENE_BUFFER,
        end=gene.end + GENE_BUFFER)
    for variant in variants:
        variant_count += 1
        for name in variant.names:
            variant_aliases[name] = variant.id
        hgvs_cdna = variant.info["HGVS_cDNA"]
        if len(hgvs_cdna) > 1:
            # Single-argument print() is valid on both Python 2 and 3.
            print("%s %s" % (hgvs_cdna[0], type(hgvs_cdna[0])))
            sys.exit()
        if variant_count % 100 == 0:
            # The original line was a bare tuple expression, so this progress
            # message was silently discarded; it is now actually printed.
            print("Downloading BRCA variants: %s %s variants downloaded"
                  % (variant_count, gene.name))
        variant_aliases[str(hgvs_cdna[0])] = variant.id
        variant_aliases[str(variant.info["HGVS_Protein"][0])] = variant.id
    print("VARIANT COUNT IS:  %s" % variant_count)
    return variant_aliases
Ejemplo n.º 5
0
#!/usr/bin/python

from ga4gh.client import client
import json
import ga4gh.client.protocol as protocol

# Base URL of the GA4GH server queried by this module (hard-coded address).
ga4gh_endpoint = "http://10.96.11.130:8000"
# Module-wide client; harvest() below uses it for every search call.
c = client.HttpClient(ga4gh_endpoint)

def harvest(genes):
    """Locate a phenotype association set and a matching feature set.

    NOTE(review): the visible body never uses ``genes`` and never returns or
    uses ``feature_set_id`` -- the function appears to be truncated in this
    listing; confirm against the full source.

    :param genes: not referenced in the visible part of the function.
    :raises AssertionError: if no phenotype association set with a truthy id
        and name was found on the server.
    """
    datasets = c.search_datasets()
    phenotype_association_set_id = None
    phenotype_association_set_name = None
    # Scan every dataset and remember the first association set of each one.
    for  dataset in datasets:
      phenotype_association_sets = c.search_phenotype_association_sets(dataset_id=dataset.id)
      for phenotype_association_set in phenotype_association_sets:
        phenotype_association_set_id = phenotype_association_set.id
        phenotype_association_set_name = phenotype_association_set.name
        # print 'Found G2P phenotype_association_set:', phenotype_association_set.id, phenotype_association_set.name
        # Only leaves the inner loop: the outer loop keeps going, so a later
        # dataset that also has association sets overwrites these values.
        break

    assert phenotype_association_set_id
    assert phenotype_association_set_name

    feature_set_id = None
    # Datasets are searched again for the second pass over the server.
    datasets = c.search_datasets()
    for  dataset in datasets:
      featuresets = c.search_feature_sets(dataset_id=dataset.id)
      for featureset in featuresets:
        # Substring match; no break, so the last matching feature set wins.
        if phenotype_association_set_name in featureset.name:
          feature_set_id = featureset.id
Ejemplo n.º 6
0
import requests.packages.urllib3
import datetime
from ga4gh.client import client
from itertools import chain
requests.packages.urllib3.disable_warnings()

# Module-wide client for the BRCA Exchange GA4GH endpoint.
httpClient = client.HttpClient(
    "https://brcaexchange.org/backend/data/ga4gh/v0.6.0a7/")
# Gene symbol -> reference name passed to search_variants() in brca_query().
chrom = {"BRCA1": "chr17", "BRCA2": "chr13"}

# Keys looked up in each variant's `info` map to build the annotation columns.
annotCols = ['id', 'Pathogenicity_all']


def brca_query(gene, start_pos, end_pos):

    query = httpClient.search_variants(reference_name=chrom[gene],
                                       variant_set_id="brca-hg37",
                                       start=int(start_pos),
                                       end=int(end_pos))

    listOutput = []
    for var in query:
        posInfo = [
            var.info['Gene_Symbol'].values[0].string_value, var.reference_name,
            var.start, var.end,
            str(var.reference_bases),
            str(var.alternate_bases[0])
        ]
        annotInfo = [
            var.info[x].values[0].string_value if x in var.info.keys() else ''
            for x in annotCols
Ejemplo n.º 7
0
import json
from ga4gh.client import client

if __name__ == '__main__':

    # [1] Boilerplate code to initialize GA4GH client
    c = client.HttpClient("http://1kgenomes.ga4gh.org")
    dataset = c.search_datasets().next()

    # [2] Fetch variant set
    for variant_set in c.search_variant_sets(dataset_id=dataset.id):
        if variant_set.name == "phase3-release":
            var_set = variant_set

    # [3] Get metadata, store in dictionary
    metadata = {'data': []}
    for data in variant_set.metadata:
        if '.' in data.key:
            key, identity = (str(x) for x in data.key.split('.'))
            metadata['data'].append({
                'key': key,
                'id': identity,
                'number': data.number,
                'type': data.type,
                'description': data.description
            })

    # [4] Write hardcoded metadata tags not on server
    metadata['data'].append({
        'key': 'FORMAT',
        'id': 'GT',
Ejemplo n.º 8
0
from __future__ import print_function
import ga4gh.client.client as client
rahman_client = client.HttpClient("http://52.160.96.216/ga4gh")

# ---------------------------------------------------------------------------
# Get datasets and variant sets
# ---------------------------------------------------------------------------
datasets = list(rahman_client.search_datasets())

# Re-fetch the first listed dataset by id.
dataset = rahman_client.get_dataset(datasets[0].id)

# `release` ends up as the last variant set named "phase3-release";
# `functional` as the last variant set with any other name.
release = None
functional = None
for variant_set in rahman_client.search_variant_sets(dataset_id=dataset.id):
    if variant_set.name != "phase3-release":
        functional = variant_set
    else:
        release = variant_set
# ---------------------------------------------------------------------------


def main():
    callsi = list(rahman_client.search_call_sets(functional.id))
    variant_sets = list(rahman_client.search_variant_sets(dataset.id))
    variant_set_id = variant_sets[0].id
    print(variant_set_id)

    call_set_ids = []
    callsi = list(rahman_client.search_call_sets(functional.id))
    for csi in callsi:
Ejemplo n.º 9
0
def _is_biallelic_snp(variant):
    """Return True for a single-base reference with exactly one single-base alt."""
    alternates = variant.alternate_bases
    return (len(variant.reference_bases) == 1
            and len(alternates) == 1
            and len(alternates[0]) == 1)


def get_ga4gh_variants_dataframe(url, chrom, start, end, results, snps_only):
    """
    Builds a DataFrame of genotypes within the requested coordinates for all
    callsets and stores it in ``results``.

    e.g.
                            index  HG00099  HG001031
    0    10_94951137_94951138_C_A      0.0      0.0
    1    10_94951708_94951709_C_T      0.0      0.0
    2    11_89179334_89179335_T_C      0.0      0.0
    3    11_89183935_89183936_G_A      0.0      0.0
    4    11_89207230_89207231_T_A      0.0      0.0
    5    11_89207617_89207618_T_A      0.0      0.0
    6    11_89207714_89207715_C_A      0.0      0.0
    7    11_89216311_89216312_A_C      0.0      0.0
    8    11_89219122_89219123_T_A      0.0      0.0
    (...)
    [XX rows x YY columns]

    XX variants x YY callsets.

    index = <chrom>_<start>_<end>_<ref>_<alt>

    The function itself returns None. The DataFrame is written to
    ``results[url + region]`` where ``region`` is ``"<chrom>:<start>-<end>"``
    and ``<chrom>`` has any ``chr`` substring removed.

    :param str url: The url of the ga4gh server.
    :param str chrom: The chromosome for the region of interest.
    :param int start: The start position for the region of interest.
    :param int end: The end position for the region of interest.
    :param dict results: Mutable mapping that receives the DataFrame.
    :param bool snps_only: When true, variants that are not bi-allelic SNPs
        (single-base ref, exactly one single-base alt) are skipped.

    :raises Exception: any error raised while querying the server or building
        the DataFrame is reported on stdout and then re-raised unchanged.
    """

    chrom = chrom.replace('chr', '')
    region = chrom + ":" + str(start) + "-" + str(end)
    print("server:{}, region {}:{}-{}".format(url, chrom, start, end))

    try:
        httpClient = client.HttpClient(url)

        # Get the datasets on the server.
        datasets = list(httpClient.search_datasets())

        # TODO: Assumption - uses the first dataset.
        # Get the variantSets in the first dataset.
        variantSets = list(httpClient.search_variant_sets(
            dataset_id=datasets[0].id))

        # TODO: Assumption - uses the first variantset.
        # Get the variants in the interval [<start>, <end>) on chromosome
        # <chrom> in the first variantSet.
        callSets = list(httpClient.search_call_sets(variantSets[0].id))

        iterator = httpClient.search_variants(
            variant_set_id=variantSets[0].id,
            reference_name=chrom, start=start, end=end,
            call_set_ids=[callset.id for callset in callSets])

        all_gts = []

        for variant in iterator:
            # Only keep the bi-allelic snps. The previous condition joined
            # the length checks with `and` (and never looked at the alt
            # allele length), so indels and multi-allelic sites passed.
            if snps_only and not _is_biallelic_snp(variant):
                continue

            # Use var_id as the index for the DataFrame
            # This will be used as the key to join on
            # var_id = <chrom>_<start>_<end>_<ref>_<alt>
            var_id = "_".join([
                variant.reference_name, str(variant.start), str(variant.end),
                variant.reference_bases, ",".join(variant.alternate_bases)])

            # For bi-allelic snps the possible genotypes are 0/0, 0/1, 1/1;
            # summing the alleles gives 0, 1 or 2.
            # gts = row of the DataFrame
            #     = [var_id, genotype_callset1, genotype_callset2, ...]
            # NOTE(review): assumes variant.calls comes back in the same
            # order as callSets -- confirm against the server.
            gts = [var_id] + [int(sum(call.genotype)) for call in variant.calls]
            all_gts.append(gts)

        # columns = [var_id, callset1, callset2, ...]
        df = pda.DataFrame(
            all_gts,
            columns=['index'] + [callset.name for callset in callSets])
        results[url + region] = df

    except Exception:
        print("Can not query the region:{} from server:{}".format(region, url))
        raise
Ejemplo n.º 10
0
def main():
    """Walk through datasets, variant sets and variants with the client and
    print a count of variants per reference-base length."""
    # Start by creating an HTTP client that points at BASE_URL.
    c = client.HttpClient(BASE_URL)

    # Tip: in an IDE with autocompletion (PyCharm, for instance) typing a
    # `.` after `c` should list the client's named functions.

    # As in the previous example, begin by looking up the datasets.
    response = c.search_datasets()

    # The client hands back a generator, so the datasets only become
    # available by iterating over the response.
    print(response)

    datasets = []
    for found in response:
        datasets.append(found)
        print(found)

    # Gather every variant set of every dataset, as `hello_ga4gh` did, but
    # without touching json: results are classed objects whose attributes
    # are reachable with dot-notation.
    variant_sets = []
    for found in datasets:
        variant_sets.extend(c.search_variant_sets(found.id))

    # Choose one variant set (the first) for the analysis.
    variantSetId = variant_sets[0].id
    variants = c.search_variants(variantSetId, 100000, 900000, "1")

    # Paging is handled by the client, so a search may yield a large
    # number of results.
    variant_list = list(variants)

    print(str(len(variant_list)) + " variants.")

    # Produce the same tally of reference-base lengths as the earlier
    # examples.
    reference_base_counts = {}
    for variant in variant_list:
        ref_len = len(variant.reference_bases)
        reference_base_counts[ref_len] = (
            reference_base_counts.get(ref_len, 0) + 1)

    # Does this match the output of `hello_ga4gh.py`?
    print(reference_base_counts)
Ejemplo n.º 11
0
from ga4gh.client import client
# Walks one dataset on the server and prints its individuals, RNA
# quantification sets, RNA quantifications and expression levels.
# Written with single-argument print() calls and the next() builtin so the
# script runs unchanged on both Python 2 and Python 3.
c = client.HttpClient("http://ga4gh_server:8000")

# Only the first dataset returned by the server is inspected.
dataset = next(c.search_datasets())

print("Individuals:")
for individual in c.search_individuals(dataset_id=dataset.id):
    print("Individual: {}".format(individual.name))
    print(" id: {}".format(individual.id))
    print(" dataset_id: {}".format(individual.dataset_id))
    print(" description: {}".format(individual.description))

print("RNA Quantification Sets:")
# Pre-bind the loop variable: the next section reuses the LAST set seen and
# would otherwise hit a NameError when the server returns none.
rna_quant_set = None
for rna_quant_set in c.search_rna_quantification_sets(dataset_id=dataset.id):
    print(" id: {}".format(rna_quant_set.id))
    print(" dataset_id: {}".format(rna_quant_set.dataset_id))
    print(" name: {}\n".format(rna_quant_set.name))

print("RNA Quantifications:")
# Same pattern: the last quantification feeds the expression-level search.
rna_quant = None
if rna_quant_set is not None:
    for rna_quant in c.search_rna_quantifications(
            rna_quantification_set_id=rna_quant_set.id):
        print("RNA Quantification: {}".format(rna_quant.name))
        print(" id: {}".format(rna_quant.id))
        print(" description: {}\n".format(rna_quant.description))

print("RNA Expression Levels:")
if rna_quant is not None:
    for expression in c.search_expression_levels(
            rna_quantification_id=rna_quant.id):
        print("Expression Level: {}".format(expression.name))
        print(" id: {}".format(expression.id))
        print(" feature: {}".format(expression.feature_id))
Ejemplo n.º 12
0
def _plot_length_histogram(figure_number, lengths, color, title, xlabel):
    """Draw a log-scaled histogram of *lengths* on figure *figure_number*."""
    plt.figure(figure_number)
    if not lengths:
        # np.max() raises ValueError on an empty sequence; leave the figure
        # empty instead of crashing when there is nothing to plot.
        return

    binning = list(range(1, np.max(lengths) + 1))
    counts, _, _ = plt.hist(
        lengths, bins=binning, facecolor=color, alpha=0.75, log=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('n variants of length (log)')
    plt.axis([0, len(counts), 0, np.max(counts)])


def main():
    """Plot histograms of reference and alternate base lengths.

    Takes the first dataset and its first variant set from the server at
    BASE_URL, loads the variants on reference "1" between positions 0 and
    100000, prints how many were found and shows one histogram for the
    reference base lengths and one for the alternate base lengths.

    :raises StopIteration: if the server lists no dataset or the dataset has
        no variant set.
    """
    # First, instantiate an HTTP client using the BASE_URL.
    c = client.HttpClient(BASE_URL)

    # Now we'll get a variant set. The next() builtin returns the first item
    # of an iterator on both Python 2 and 3 (the `.next()` method exists on
    # Python 2 only).
    dataset = next(c.search_datasets())
    variant_set = next(c.search_variant_sets(dataset.id))

    # We now collect the variants in that variant set.
    variants = c.search_variants(
        variant_set.id,         # The ID of the variantSet
        start=0,                # Start position
        end=100000,             # End position
        reference_name="1")     # chrom

    # And copy them into `variant_list`.
    variant_list = list(variants)

    # Our analysis makes counts of the reference and alternate base lengths,
    # so grab those from each variant.
    ref_lengths = [len(variant.reference_bases) for variant in variant_list]
    alt_lengths = [
        len(base)
        for variant in variant_list
        for base in variant.alternate_bases]

    print(str(len(variant_list)) + " variants.")

    # Now we can create histograms for each of these lists.
    # see more examples http://matplotlib.org/1.2.1/examples/pylab_examples/histogram_demo.html
    _plot_length_histogram(
        1, ref_lengths, 'red',
        "Frequency of reference base lengths", 'Length of reference')
    _plot_length_histogram(
        2, alt_lengths, 'blue',
        "Frequency of alternate base lengths", 'Length of alts')

    plt.show()
Ejemplo n.º 13
0
from __future__ import print_function
import ga4gh.client.client as client
import json

# Builds simons_csids.json: call set id -> [biosample name, variant set id]
# for every variant set whose name equals a biosample's 'Individual_id'.
simons_client = client.HttpClient("http://10.50.100.241/")

# ---------------------------------------------------------------------------
# Get datasets and variant sets
# ---------------------------------------------------------------------------
datasets = list(simons_client.search_datasets())
dataset = simons_client.get_dataset(datasets[0].id)

# The variant-set listing does not depend on the biosample, so fetch it once
# instead of re-querying the server for every biosample.
variant_sets = list(simons_client.search_variant_sets(dataset_id=dataset.id))

simons_csid_dict = {}
for biosample in simons_client.search_biosamples(dataset_id=dataset.id):
    individual_id = biosample.info['Individual_id'].values[0].string_value
    for variant_set in variant_sets:
        if variant_set.name != individual_id:
            continue
        print(variant_set.name)
        print(variant_set.id)
        # Only the first call set of the matching variant set is recorded.
        callset = list(simons_client.search_call_sets(variant_set.id))[0]
        print(callset.id)
        biosample_name = biosample.info['Name'].values[0].string_value
        print(biosample_name)
        simons_csid_dict[callset.id] = (biosample_name, variant_set.id)

with open("simons_csids.json", 'w') as output_file:
    json.dump(simons_csid_dict, output_file)