# Assumed imports for the helpers below. The stdlib and synapseclient imports
# are certain from usage; the nebula-specific names (FileDocStore, Target,
# GalaxyWorkflow, GalaxyService, TaskGroup, sync_doc_dir, get_abspath) are
# expected to be in scope from the surrounding project -- their exact module
# paths depend on the repo layout and are not reproduced here.
import csv
import datetime
import json
import logging
import os
import re
import shutil
import subprocess

import synapseclient
import nebula.tasks


def run_audit(docstore, sample_list):
    doc = FileDocStore(file_path=docstore)

    # The sample list file holds one expected sample name per line
    master_list = []
    with open(sample_list) as handle:
        for line in handle:
            master_list.append(line.rstrip())

    # Map sample name -> list of VCF dataset names found in the docstore
    results = {}
    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            if doc.size(entry) > 0:
                results[sample] = results.get(sample, []) + [entry['name']]

    for sample, files in results.items():
        print "%s (%s) %s" % (sample, len(files), "\t".join(files))

    # Flag samples that are absent or have fewer than the expected three VCFs
    for sample in master_list:
        if sample not in results or len(results[sample]) < 3:
            print "missing (%s)" % (len(results.get(sample, []))), sample
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter(visible=True):
        if entry.get('state', 'ok') == 'error':
            print entry
def run_ls(docstore, size=False):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if size:
            print id, entry.get('name', id), doc.size(entry)
        else:
            print id, entry.get('name', id)
def run_timing(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if 'job' in entry and 'job_metrics' in entry['job']:
            timing = None
            for met in entry['job']['job_metrics']:
                if met['name'] == 'runtime_seconds':
                    timing = met['raw_value']
            if timing is not None:
                print id, entry["name"], timing
def testNebulaLaunch(self):
    input = {
        "input_file_1": Target("c39ded10-6073-11e4-9803-0800200c9a66"),
        "input_file_2": Target("26fd12a2-9096-4af2-a989-9e2f1cb692fe")
    }
    parameters = {
        "tail_select": {
            "lineNum": 3
        }
    }

    doc = FileDocStore(file_path=get_abspath("../test_tmp/docstore"))

    logging.info("Adding files to object store")
    sync_doc_dir("examples/simple_galaxy/", doc,
                 uuid_set=["c39ded10-6073-11e4-9803-0800200c9a66",
                           "26fd12a2-9096-4af2-a989-9e2f1cb692fe"])

    logging.info("Creating Task")
    workflow = GalaxyWorkflow(ga_file=get_abspath("../examples/simple_galaxy/SimpleWorkflow.ga"))
    task = nebula.tasks.GalaxyWorkflowTask(
        "test_workflow", workflow,
        inputs=input,
        parameters=parameters
    )

    service = GalaxyService(
        docstore=doc,
        name="nosetest_galaxy",
        galaxy="bgruening/galaxy-stable:dev",
        port=20022
    )

    task_path = get_abspath("../test_tmp/test.tasks")
    service_path = get_abspath("../test_tmp/test.service")
    taskset = TaskGroup()
    taskset.append(task)
    with open(task_path, "w") as handle:
        taskset.store(handle)
    with open(service_path, "w") as handle:
        service.get_config().set_docstore_config(
            cache_path=get_abspath("../test_tmp/cache")).store(handle)

    env = dict(os.environ)
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] += ":" + get_abspath("../")
    else:
        env['PYTHONPATH'] = get_abspath("../")

    subprocess.check_call(
        [get_abspath("../bin/nebula"), "run", service_path, task_path],
        env=env)

    for i in doc.filter():
        print json.dumps(i, indent=4)
def run_ls(docstore, size=False, extra=None):
    # `extra` names additional entry fields to print for each dataset.
    # The original body read `args.extra`, which is not in scope here, and
    # shadowed the parameter with a local `extra = []`; both are fixed below.
    if extra is None:
        extra = []
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        extra_values = []
        for e in extra:
            extra_values.append(str(entry.get(e, "")))
        if size:
            print id, entry.get('name', id), doc.size(entry), " ".join(extra_values)
        else:
            print id, entry.get('name', id), " ".join(extra_values)
def run_errors(docstore):
    doc = FileDocStore(file_path=docstore)
    for id, entry in doc.filter():
        if entry.get('state', '') == 'error':
            print "Dataset", id, entry.get("tags", "")
            if 'provenance' in entry:
                print "tool:", entry['provenance']['tool_id']
            print "-=-=-=-=-=-=-"
            print entry['job']['stdout']
            print "-------------"
            print entry['job']['stderr']
            print "-=-=-=-=-=-=-"
def run_copy(docstore, out_docstore):
    doc = FileDocStore(file_path=docstore)
    out_doc = FileDocStore(file_path=out_docstore)

    for id, entry in doc.filter():
        if out_doc.get(id) is None:
            print "copy", id
            out_doc.put(id, entry)
            if doc.exists(entry):
                src_path = doc.get_filename(entry)
                out_doc.create(entry)
                dst_path = out_doc.get_filename(entry)
                shutil.copy(src_path, dst_path)
                out_doc.update_from_file(entry)
        else:
            if doc.size(entry) != out_doc.size(entry):
                print "mismatch", id
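# Usage (illustrative; the paths below are hypothetical): mirror one docstore
# into another. run_copy only copies entries missing from the destination and
# reports a size mismatch for entries that both stores already hold.
#
#   run_copy("/data/docstore", "/backup/docstore")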
def run_query(docstore, fields, size, filters):
    doc = FileDocStore(file_path=docstore)

    # filters are key=value strings turned into keyword arguments for filter()
    filter = {}
    for k in filters:
        tmp = k.split("=")
        filter[tmp[0]] = tmp[1]

    for id, entry in doc.filter(**filter):
        if fields is None or len(fields) == 0:
            line = entry
        else:
            line = dict((i, entry.get(i, "")) for i in fields)
        if size:
            size_value = doc.size(Target(uuid=entry['uuid']))
        else:
            size_value = ""
        print size_value, json.dumps(line)
def run_synapse(docstore, parent, workdir):
    doc = FileDocStore(file_path=docstore)
    syn = synapseclient.Synapse()
    syn.login()

    for id, entry in doc.filter(visible=True, data_type='galaxy.datatypes.tabular.Vcf'):
        if 'tags' in entry:
            sample = None
            for s in entry['tags']:
                tmp = s.split(":")
                if tmp[0] == 'sample':
                    sample = tmp[1]
            name = entry['name']
            # escape the dot so only a literal ".vcf" suffix is stripped
            # (the original pattern r'.vcf$' matched any character before "vcf")
            name = re.sub(r'\.vcf$', '', name)
            file_name = sample + "." + name + ".snv_mnv.vcf"
            target = Target(uuid=entry['uuid'])
            if doc.size(target) > 0:
                src_file = doc.get_filename(target)
                dst_file = os.path.join(workdir, file_name)

                # skip files that already exist under the Synapse parent entity
                query = "select * from entity where parentId=='%s' and name=='%s'" % (parent, file_name + ".gz")
                r = syn.query(query)['results']
                if len(r) == 0:
                    print dst_file
                    shutil.copy(src_file, dst_file)
                    subprocess.check_call("bgzip %s" % (dst_file), shell=True)
                    f = synapseclient.File(dst_file + ".gz", parentId=parent, name=file_name + ".gz")
                    f.fileType = 'vcf'
                    f.pipeline = 'UCSC'
                    f.variant_type = "snv"
                    f = syn.store(f, executed="https://github.com/ucsccancer/pcawg_tools")
                else:
                    print "Skipping", file_name
def run_scan(docstore, workdir, keyfile, upload_url, manifest):
    doc = FileDocStore(file_path=docstore)

    file_map = {
        'broad': {},
        'muse': {}
    }

    # Donor_ID -> manifest row, used later to build the GNOS metadata URLs
    wl_map = {}
    with open(manifest) as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            wl_map[row['Donor_ID']] = row

    for id, entry in doc.filter(visible=True):
        if entry.get('extension', None) in ["vcf", "vcf_bgzip"]:
            if 'tags' in entry:
                sample = None
                for s in entry['tags']:
                    tmp = s.split(":")
                    if tmp[0] == 'sample':
                        sample = tmp[1]

                # classify the dataset by pipeline, method, variant and call type
                pipeline = None
                method = None
                call_type = None
                variant_type = None
                if entry['name'] in ['MUSE_1.0rc', 'MUSE_0.9.9.5']:
                    pipeline = "muse"
                    method = entry['name'].replace(".", "-")
                    variant_type = 'somatic'
                    call_type = 'snv_mnv'
                elif entry['name'].split(".")[0] in ['broad-dRanger', 'broad-dRanger_snowman', 'broad-snowman', 'broad-mutect']:
                    pipeline = "broad"
                    method = entry['name'].split(".")[0]
                    if 'somatic' in entry['name']:
                        variant_type = 'somatic'
                    elif 'germline' in entry['name']:
                        variant_type = 'germline'
                    else:
                        raise Exception("Unknown variant type")
                    if 'snv_mnv.vcf' in entry['name']:
                        call_type = 'snv_mnv'
                    elif 'sv.vcf' in entry['name']:
                        call_type = 'sv'
                    elif 'indel.vcf' in entry['name']:
                        call_type = 'indel'
                    else:
                        raise Exception("Unknown call type: %s" % (entry['name']))
                else:
                    raise Exception("Unknown pipeline %s" % (entry['name']))

                datestr = datetime.datetime.now().strftime("%Y%m%d")
                name = "%s.%s.%s.%s.%s" % (sample, method, datestr, variant_type, call_type)
                name = re.sub(r'\.vcf$', '', name)
                if entry['extension'] == 'vcf':
                    file_name = name + ".vcf"
                elif entry['extension'] == 'vcf_bgzip':
                    file_name = name + ".vcf.gz"
                print file_name

                target = Target(uuid=entry['uuid'])
                if doc.size(target) > 0:
                    src_file = doc.get_filename(target)
                    dst_file = os.path.join(workdir, file_name)
                    shutil.copy(src_file, dst_file)
                    # bgzip plain VCFs so everything downstream is .vcf.gz
                    if entry['extension'] == 'vcf':
                        subprocess.check_call("bgzip %s" % dst_file, shell=True)
                        dst_file = dst_file + ".gz"
                    # index with tabix, then record md5s for the file and its index
                    subprocess.check_call("tabix -p vcf %s" % (dst_file), shell=True)
                    shutil.move("%s.tbi" % (dst_file), "%s.idx" % (dst_file))
                    subprocess.check_call("md5sum %s | awk '{print$1}' > %s.md5" % (dst_file, dst_file), shell=True)
                    subprocess.check_call("md5sum %s.idx | awk '{print$1}' > %s.idx.md5" % (dst_file, dst_file), shell=True)
                    if sample not in file_map[pipeline]:
                        file_map[pipeline][sample] = []
                    file_map[pipeline][sample].append(os.path.basename(dst_file))

    # emit one GNOS upload script per pipeline/sample pair
    # (the original also re-read `dst_file` here, a leftover from the loop above)
    for pipeline, samples in file_map.items():
        for sample, files in samples.items():
            with open(os.path.join(workdir, "%s.%s.sh" % (pipeline, sample)), "w") as handle:
                urls = [
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Normal_GNOS_endpoint'], wl_map[sample]['Normal_Analysis_ID']),
                    "%scghub/metadata/analysisFull/%s" % (wl_map[sample]['Tumour_GNOS_endpoint'], wl_map[sample]['Tumour_Analysis_ID'])
                ]
                cmd_str = "perl /opt/vcf-uploader/gnos_upload_vcf.pl"
                cmd_str += " --metadata-urls %s" % (",".join(urls))
                cmd_str += " --vcfs %s" % (",".join(files))
                cmd_str += " --vcf-md5sum-files %s" % (",".join("%s.md5" % i for i in files))
                cmd_str += " --vcf-idxs %s" % (",".join("%s.idx" % i for i in files))
                cmd_str += " --vcf-idx-md5sum-files %s" % (",".join("%s.idx.md5" % i for i in files))
                cmd_str += " --outdir %s.%s.dir" % (pipeline, sample)
                cmd_str += " --key %s" % (keyfile)
                cmd_str += " --upload-url %s" % (upload_url)
                cmd_str += " --study-refname-override tcga_pancancer_vcf_test"
                handle.write("""#!/bin/bash
%s
""" % (cmd_str))
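# A minimal sketch of how these run_* handlers could be wired into a CLI.
# The subcommand names and flags below are assumptions for illustration, not
# the project's actual interface; the original run_ls read `args.extra`,
# which suggests an argparse-style dispatcher along these lines.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="docstore utilities")
    subparsers = parser.add_subparsers(dest="command")

    parser_ls = subparsers.add_parser("ls")
    parser_ls.add_argument("docstore")
    parser_ls.add_argument("--size", action="store_true", default=False)
    parser_ls.add_argument("--extra", action="append", default=[])

    parser_errors = subparsers.add_parser("errors")
    parser_errors.add_argument("docstore")

    args = parser.parse_args()
    if args.command == "ls":
        run_ls(args.docstore, size=args.size, extra=args.extra)
    elif args.command == "errors":
        run_errors(args.docstore)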