コード例 #1
0
ファイル: galaxy.py プロジェクト: bgruening/docker-recipes
def get( dataset_id, history_id = None ):
    """
        Download a dataset from a Galaxy history and store it under /import/.

        dataset_id -- the number of the dataset as shown in the history; it is
                      matched against the ``hid`` field of the history contents
                      and translated to the encoded Galaxy dataset id.
        history_id -- the history to read from; when omitted (or falsy) the
                      value of _get_history_id() is used.

        Return value is the path to the dataset stored under /import/.
        Raises KeyError when the history has no dataset with that hid.
    """
    # The configuration object is not used below; the call is kept because
    # the original made it (it may load or validate settings -- TODO confirm).
    _get_conf()
    gi = get_galaxy_connection()
    hc = HistoryClient( gi )
    dc = DatasetClient( gi )

    file_path = '/import/%s' % dataset_id
    history_id = history_id or _get_history_id()

    # Cache the file requests. E.g. in the example of someone doing something
    # silly like a get() for a Galaxy file in a for-loop, wouldn't want to
    # re-download every time and add that overhead.
    if not os.path.exists( file_path ):
        dataset_mapping = {
            dataset['hid']: dataset['id']
            for dataset in hc.show_history( history_id, contents=True )
        }
        # Resolve the hid once, outside the try block, so an unknown hid
        # surfaces as a plain KeyError instead of triggering the fallback.
        galaxy_dataset_id = dataset_mapping[dataset_id]
        try:
            hc.download_dataset( history_id, galaxy_dataset_id, file_path, use_default_filename=False, to_ext=None )
        except Exception:
            # Fall back to the dataset client when the history client call
            # fails. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit are not swallowed by a second download attempt.
            dc.download_dataset( galaxy_dataset_id, file_path, use_default_filename=False )

    return file_path
コード例 #2
0
def get(dataset_id, history_id=None):
    """
        Fetch one dataset of a Galaxy history into /import/ and return its path.

        ``dataset_id`` is the per-history number of the dataset: it is looked
        up among the ``hid`` values of the history contents to obtain the
        encoded Galaxy id. ``history_id`` selects the history; if it is not
        given, _get_history_id() supplies it.

        A file that already exists at the target path is reused and nothing
        is downloaded. An unknown ``dataset_id`` raises KeyError.
    """
    # Return value intentionally discarded: nothing below reads it. The call
    # itself is preserved from the original (purpose not visible here).
    _get_conf()
    gi = get_galaxy_connection()
    hc = HistoryClient(gi)
    dc = DatasetClient(gi)

    file_path = '/import/%s' % dataset_id
    history_id = history_id or _get_history_id()

    # Already downloaded: repeated get() calls (e.g. inside a loop) must not
    # pay for the transfer again.
    if os.path.exists(file_path):
        return file_path

    dataset_mapping = {
        dataset['hid']: dataset['id']
        for dataset in hc.show_history(history_id, contents=True)
    }
    # Looked up before the try block so a missing hid is reported directly
    # rather than being caught and retried.
    encoded_id = dataset_mapping[dataset_id]

    try:
        hc.download_dataset(history_id,
                            encoded_id,
                            file_path,
                            use_default_filename=False,
                            to_ext=None)
    except Exception:
        # The history client could not deliver the file; retry through the
        # dataset client. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        dc.download_dataset(encoded_id,
                            file_path,
                            use_default_filename=False)

    return file_path
コード例 #3
0
def get( dataset_id ):
    """
        Download a dataset of the configured history and store it under
        /import/.

        dataset_id -- the dataset number within the history; it is matched
                      against the ``hid`` field of the history contents. The
                      history itself is taken from conf['history_id'].

        Return value is the path to the dataset stored under /import/.
        Raises KeyError when no dataset with that hid exists in the history.
    """
    conf = _get_conf()
    gi = get_galaxy_connection()
    hc = HistoryClient( gi )
    dc = DatasetClient( gi )

    file_path = '/import/%s' % dataset_id
    history_id = conf['history_id']

    dataset_mapping = {
        dataset['hid']: dataset['id']
        for dataset in hc.show_history( history_id, contents=True )
    }
    # Resolve the hid once and outside the try block: an unknown hid is a
    # caller error and should not be routed through the download fallback.
    galaxy_dataset_id = dataset_mapping[dataset_id]
    try:
        hc.download_dataset( history_id, galaxy_dataset_id, file_path, use_default_filename=False, to_ext=None )
    except Exception:
        # Second attempt through the dataset client. A bare ``except`` would
        # also intercept KeyboardInterrupt/SystemExit, hence Exception.
        dc.download_dataset( galaxy_dataset_id, file_path, use_default_filename=False )

    return file_path
コード例 #4
0
# Library dataset names for the species handled by the pathway workflow that
# has no secreted-protein input. Each tuple is ordered
# (network, geneLengths, homology), which is also the lookup order.
_BASIC_PATHWAY_LIBRARY_DATA = {
    "Rat": ("ratStringNetwork", "ratGeneLengths", "HOM_AllOrganism.rpt"),
    "Cow": ("cowStringNetwork", "cowGeneLengths", "HOM_AllOrganism.rpt"),
    "Horse": ("horseStringNetwork", "horseGeneLengths", "Homology.horse.txt"),
    "Pig": ("pigStringNetwork.txt", "pigGeneLengths.tabular", "Homology.pig.txt"),
    "Zebrafish": ("zebrafishStringNetwork", "zebrafishGeneLengths", "HOM_AllOrganism.rpt"),
}

# Same ordering plus a fourth entry, the secreted-protein reference.
_MOUSE_PATHWAY_LIBRARY_DATA = (
    "mouseStringNetwork", "MouseGeneLengths.tab",
    "Homology.mouse.txt", "uniprot-secreted-mouse.txt",
)
# Used for every species that is neither in the table above nor "Mouse".
# NOTE(review): the homology entry is the mouse file, exactly as in the
# original code -- confirm that this is intended for human data.
_DEFAULT_PATHWAY_LIBRARY_DATA = (
    "humanStringNetwork", "geneLengths",
    "Homology.mouse.txt", "uniprot-secreted-human.txt",
)

# Workflow input slot for each position of the tuples above.
_PATHWAY_INPUT_SLOTS = ('2', '1', '3', '4')

# (result key, dataset name) pairs collected after each workflow invocation.
_BASIC_PATHWAY_OUTPUTS = (
    ('moduleNodes', 'moduleNodes.text'),
    ('modulePlots', 'modulePlots.pdf'),
    ('slimEnrichPathways', 'slimEnrichmentPathways.tabular'),
    ('enrichedDrugsReverse', 'enrichedDrugsReverse.tabular'),
    ('enrichedDrugsMimic', 'enrichedDrugsMimic.tabular'),
    ('enrichedTerms', 'enrichedTerms.tabular'),
    ('enrichedTerms.reduced', 'enrichedTerms.reduced.tabular'),
    ('GO.MDS', 'GO.MDS.html'),
)
_FULL_PATHWAY_OUTPUTS = (
    ('moduleNodes', 'moduleNodes.text'),
    ('modulePlots', 'modulePlots.pdf'),
    ('pathways', 'pathways.tabular'),
    ('enrichPlot', 'enrichmentPlot.png'),
    ('enrichmentTable', 'TF_EnrichmentTable.tabular'),
    ('slimEnrichPathways', 'slimEnrichmentPathways.tabular'),
    ('secretedProteins', 'secretedProteins.tabular'),
    ('enrichedDrugsReverse', 'enrichedDrugsReverse.tabular'),
    ('enrichedDrugsMimic', 'enrichedDrugsMimic.tabular'),
    ('enrichedTerms', 'enrichedTerms.tabular'),
    ('enrichedTerms.reduced', 'enrichedTerms.reduced.tabular'),
    ('GO.MDS', 'GO.MDS.html'),
)

# Column order of the TSV summary; hard coded to define the order.
_BASIC_OUTPUT_KEYS = [
    'accessionNo', 'multiQC', 'factor', 'PCA', 'chrDirTable', 'comparisonNum',
    'comparisonDenom', 'foldChange', 'interactome', 'exonLength',
    'moduleNodes', 'modulePlots', 'slimEnrichPathways', 'secretedProteins',
    'enrichedDrugsReverse', 'enrichedDrugsMimic', 'enrichedTerms',
    'enrichedTerms.reduced', 'GO.MDS',
]
_FULL_OUTPUT_KEYS = [
    'accessionNo', 'multiQC', 'factor', 'PCA', 'chrDirTable', 'comparisonNum',
    'comparisonDenom', 'foldChange', 'interactome', 'exonLength',
    'moduleNodes', 'modulePlots', 'pathways', 'enrichPlot', 'enrichmentTable',
    'slimEnrichPathways', 'secretedProteins', 'enrichedDrugsReverse',
    'enrichedDrugsMimic', 'enrichedTerms', 'enrichedTerms.reduced', 'GO.MDS',
]


def _waitUntilHistoryComplete(history, history_client, pollSeconds=10):
    """Sleep in pollSeconds steps until getNumberNotComplete() reports 0."""
    import time

    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(pollSeconds)


def _writeWorkflowOutput(outFileName, keys, rows):
    """Write rows (a list of dicts) to outFileName as a tab-separated table."""
    import csv
    import sys

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3; the original 'wb' only worked on Python 2.
    if sys.version_info[0] < 3:
        csvFile = open(outFileName, 'wb')
    else:
        csvFile = open(outFileName, 'w', newline='')
    with csvFile:
        csvOutput = csv.DictWriter(csvFile, keys, delimiter="\t")
        csvOutput.writeheader()
        csvOutput.writerows(rows)


def runWorkflow(argDictionary, comparisons, samples):
    """Run the RNA-seq analysis on a Galaxy server and collect result links.

    The function creates a history named after the accession number, uploads
    the two tables, fetches the reads (ENA or SRA), runs FastQC, kallisto,
    MultiQC, the gene-level count conversion, PCA, DESeq2 and the
    characteristic direction tool, cuts the fold change table per comparison
    and finally invokes a pathway analysis workflow once per cut table.

    Parameters:
        argDictionary: dict of settings. Keys read here: 'accessionNumber',
            'site', 'ENA' (only when site is "ENA"), 'single', 'stranded',
            'species', 'technicalReplicates', 'batchCorrect' and
            'foldChangeOnly'. The whole dict is also passed as parameters to
            every step of the pathway workflow.
        comparisons: path of the comparisons table; uploaded as 'txt'. Its
            non-blank line count minus one is the number of comparisons.
        samples: path of the tab-separated sample table; uploaded as
            'tabular'. The columns "File" ('|'-separated names) and "Sample"
            are read.

    Returns:
        A list of dicts. The first holds the experiment-wide links
        ('accessionNo', 'foldChange', 'PCA', 'chrDirTable'); each following
        dict describes one comparison. The same rows are written to
        'output/<accessionNumber>-workflowOutput.tsv'.

    Every wait is an unbounded polling loop, so the call blocks until Galaxy
    reports the expected datasets.
    """
    from bioblend.galaxy import GalaxyInstance
    from bioblend.galaxy.histories import HistoryClient
    from bioblend.galaxy.tools import ToolClient
    from bioblend.galaxy.workflows import WorkflowClient
    from bioblend.galaxy.libraries import LibraryClient
    # csv was used without a visible import in the original block; importing
    # it locally keeps the function self-contained.
    import csv
    import shutil
    import tempfile
    import time

    api_key = ''
    galaxy_host = ''

    gi = GalaxyInstance(url=galaxy_host, key=api_key)

    history_client = HistoryClient(gi)
    tool_client = ToolClient(gi)
    workflow_client = WorkflowClient(gi)
    library_client = LibraryClient(gi)

    history = history_client.create_history(argDictionary['accessionNumber'])

    comparisonsTable = tool_client.upload_file(comparisons, history['id'], file_type='txt')
    sampleTable = tool_client.upload_file(samples, history['id'], file_type='tabular')

    if argDictionary['site'] == "ENA":
        # fastqs available on ENA
        tool_inputs = {
            "accessionNumber": argDictionary["ENA"],
            "sampleTable": {'id': sampleTable['outputs'][0]['id'], 'src': 'hda'},
        }

        # run the tool to get the data from ENA
        tool_client.run_tool(history['id'], 'getRNASeqExpressionData', tool_inputs)

        # we want to wait until we have all datasets
        _waitUntilHistoryComplete(history, history_client)

        # sleep until all the fastq files are findable
        time.sleep(120)

        # Download the file listing only to count its lines (minus the
        # header); the temporary directory is removed again afterwards.
        dirpath = tempfile.mkdtemp()
        try:
            fileList = getDatasetsByApproxName("files.tabular", history, history_client)[0]
            fileList = history_client.download_dataset(history["id"], fileList["id"], dirpath)
            with open(fileList) as fileListHandle:
                num_lines = sum(1 for _ in fileListHandle) - 1
        finally:
            shutil.rmtree(dirpath, ignore_errors=True)

        # Poll until exactly one fastq dataset per listed file is visible.
        # NOTE(review): this never terminates if the counts cannot match.
        datasets = list()
        while len(datasets) != num_lines:
            time.sleep(10)
            datasets = getDatasetsByApproxName("fastq", history, history_client)
    else:  # for SRA
        if argDictionary['single'] == "TRUE":
            with open(samples) as tsvfile:
                reader = csv.DictReader(tsvfile, delimiter='\t')
                for sample in reader:
                    print(sample)
                    for fileName in sample["File"].split("|"):
                        tool_inputs = {
                            "input|input_select": "accession_number",
                            "outputformat": "fastqsanger.gz",
                            "input|accession": fileName,
                        }
                        # run the tool to get the single data from SRA
                        tool_client.run_tool(history['id'], 'toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/fastq_dump/2.8.1.3', tool_inputs)
        else:
            with open(samples) as tsvfile:
                reader = csv.DictReader(tsvfile, delimiter='\t')
                for sample in reader:
                    tool_inputs = {
                        "accession_number": sample["File"],
                    }
                    # run the tool to get the paired data from SRA
                    tool_client.run_tool(history['id'], 'toolshed.g2.bx.psu.edu/repos/mandorodriguez/fastqdump_paired/fastq_dump_paired/1.1.4', tool_inputs)

        _waitUntilHistoryComplete(history, history_client)

    # run FastQC on every fastq dataset
    datasets = getDatasetsByApproxName("fastq", history, history_client)
    for fastq in datasets:
        try:
            tool_inputs = {'input_file': {'id': fastq['id'], 'src': 'hda'}}
            tool_client.run_tool(history['id'], 'toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.69', tool_inputs)
        except Exception as err:
            # FastQC stays best-effort, but the failure is reported instead
            # of being dropped silently.
            print("Skipping FastQC for %r: %s" % (fastq, err))

    # wait till complete
    _waitUntilHistoryComplete(history, history_client)

    # make dataset collections for quantification using the fastq files
    collections = list()
    with open(samples) as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            datasets = list()
            for fileName in row["File"].split("|"):
                datasets = datasets + getDatasetsByApproxName(fileName, history, history_client)

            # one collection per sample
            collections.append(makeDataSetCollection(datasets, row["Sample"], history, history_client))

    # get the correct kallisto index
    species = argDictionary['species'].lower()
    index = getLibraryToolDataID(history=history, history_client=history_client, library_client=library_client, name=species + "IndexFile")
    index = {'id': index, 'src': 'hda'}

    # run kallisto for every dataset collection
    for collection in collections:
        tool_inputs = {
            'index': index,
            'inputs': {'id': collection['id'], 'src': 'hdca'},
            "single": argDictionary["single"],
            "stranded": argDictionary["stranded"],
        }
        # The original comment mentioned ignoring "connection broken" errors
        # here, but no handler exists: an exception aborts the run.
        tool_client.run_tool(history['id'], 'kallistoQuant', tool_inputs)

    # we want to wait until we have all datasets
    _waitUntilHistoryComplete(history, history_client)

    # Run multiqc on kallisto logs and fastqc files
    datasets = getDatasetsByApproxName("RawData", history, history_client)
    kallistoLogs = getDatasetsByApproxName(".log", history, history_client)

    tool_inputs = {}
    for i, dataset in enumerate(datasets + kallistoLogs):
        if not dataset["deleted"]:
            if dataset in datasets:
                software = 'fastqc'
            else:
                software = 'kallisto'
            params = {'id': dataset['id'], 'src': 'hda', 'name': dataset['name']}
            tool_inputs.update({'results_%s|software_cond|software' % i: software, 'results_%s|input_file' % i: params})

    # summarise with the multiQC tool
    tool_client.run_tool(history['id'], 'multiqc', tool_inputs)

    # NOTE(review): multiQc is never used afterwards and the 'multiQC' column
    # of the TSV stays empty -- possibly its link was meant to be reported.
    # The lookup is kept because it fails early when no such dataset exists.
    multiQc = getDatasetsByApproxName("multiqc", history, history_client)[0]

    # get all the abundance files to convert to gene level counts matrix
    datasets = getDatasetsByApproxName(".abundance", history, history_client)

    # make a dataset collection to build the counts matrix from
    collection = makeDataSetCollection(datasets, "abundances", history, history_client)

    tool_inputs = {'inputs': {'id': collection['id'], 'src': 'hdca'}, "species": argDictionary['species']}

    # convert abundances to gene level counts matrix
    tool_client.run_tool(history['id'], 'KallistoAbundancestoGeneCountMatrix', tool_inputs)

    # A dirty hack, we want to wait until we have all datasets
    _waitUntilHistoryComplete(history, history_client)

    txi = getDatasetsByApproxName("txi", history, history_client)

    # set up the tool_inputs for PCA
    tool_inputs = {
        'txiData': {'id': txi[0]['id'], 'src': 'hda'},
        'sampleTable': {'id': sampleTable['outputs'][0]['id'], 'src': 'hda'},
        "species": argDictionary['species'],
        'technicalReplicates': argDictionary['technicalReplicates'],
        'batchCorrect': argDictionary['batchCorrect'],
    }

    # run the PCA
    tool_client.run_tool(history['id'], 'PCARNASeq', tool_inputs)

    pca = getDatasetsByApproxName("PCA", history, history_client)[0]

    # set up the tool_inputs for DESeq2
    tool_inputs = {
        'txiData': {'id': txi[0]['id'], 'src': 'hda'},
        'sampleTable': {'id': sampleTable['outputs'][0]['id'], 'src': 'hda'},
        'comparisonsTable': {'id': comparisonsTable['outputs'][0]['id'], 'src': 'hda'},
        "foldChangeOnly": argDictionary['foldChangeOnly'],
        "species": argDictionary['species'],
        'technicalReplicates': argDictionary['technicalReplicates'],
        'batchCorrect': argDictionary['batchCorrect'],
    }

    # run deseq2
    tool_client.run_tool(history['id'], 'DESeq2FoldChange', tool_inputs)

    # run chrdir (same inputs as DESeq2)
    tool_client.run_tool(history['id'], 'characteristicDirectionRNASeq', tool_inputs)

    # we want to wait until we have all datasets
    _waitUntilHistoryComplete(history, history_client)

    # get the foldchange data, cut and run pathway workflow
    dataset_id = getFoldChangeData(history, history_client)['id']

    return_collection = [{
        'accessionNo': argDictionary['accessionNumber'],
        'foldChange': getUrl(dataset_id),
        'PCA': getUrl(pca["id"]),
        'chrDirTable': getUrl(getMostRecentDatasetByName('chrDirTable.tabular', history, history_client)['id']),
    }]

    # Every non-blank line except the header is one comparison.
    with open(comparisons) as comparisonsHandle:
        number_of_comparisons = sum(1 for line in comparisonsHandle if not line.isspace()) - 1

    for comparison in range(number_of_comparisons):
        tool_inputs = {
            'foldChangeTable': {'id': dataset_id, 'src': 'hda'},
            'comparisonNumber': comparison + 1,
        }
        tool_client.run_tool(history['id'], 'cutFoldChangeTable', tool_inputs)

    _waitUntilHistoryComplete(history, history_client)

    # Pick the pathway workflow and its reference data for the species.
    speciesName = argDictionary['species']
    if speciesName in _BASIC_PATHWAY_LIBRARY_DATA:
        workflowId = 'c9468fdb6dc5c5f1'
        libraryNames = _BASIC_PATHWAY_LIBRARY_DATA[speciesName]
        expectedNewDatasets = 10
        outputDatasets = _BASIC_PATHWAY_OUTPUTS
        keys = _BASIC_OUTPUT_KEYS
    else:
        workflowId = 'e85a3be143d5905b'
        if speciesName == "Mouse":
            libraryNames = _MOUSE_PATHWAY_LIBRARY_DATA
        else:
            libraryNames = _DEFAULT_PATHWAY_LIBRARY_DATA
        expectedNewDatasets = 14
        outputDatasets = _FULL_PATHWAY_OUTPUTS
        keys = _FULL_OUTPUT_KEYS

    pathwayAnalysisWorkflow = workflow_client.show_workflow(workflowId)

    # every workflow step receives the complete argument dictionary
    params = dict()
    for key in pathwayAnalysisWorkflow['steps']:
        params[key] = argDictionary

    pathwayDatamap = dict()
    for slot, libraryName in zip(_PATHWAY_INPUT_SLOTS, libraryNames):
        libraryId = getLibraryToolDataID(history=history, history_client=history_client, library_client=library_client, name=libraryName)
        pathwayDatamap[slot] = {'id': libraryId, 'src': 'hda'}

    diffExpDataCollection = getDatasetsByName('cutTable.tabular', history, history_client)
    for index, diffExpData in enumerate(diffExpDataCollection):
        # The invocation counts as finished once this many datasets are
        # complete (10 or 14 more than now, depending on the workflow).
        numCompleted = getNumberComplete(history['id'], history_client) + expectedNewDatasets
        print(numCompleted)

        pathwayDatamap["0"] = {'id': diffExpData['id'], 'src': 'hda'}
        workflow_client.invoke_workflow(pathwayAnalysisWorkflow['id'],
                                        inputs=pathwayDatamap,
                                        history_id=history['id'],
                                        params=params)

        comparisonDict = getRowFromCsv(comparisons, index)

        # two-factor designs are reported as "Factor1.Factor2"
        if 'Factor1' in comparisonDict:
            comparisonDict['Factor'] = comparisonDict['Factor1'] + "." + comparisonDict['Factor2']

        # NOTE(review): 'interactome' reports input slot 0 (the cut table)
        # and 'exonLength' slot 2 (the network), unchanged from the original;
        # the slot choice looks suspicious -- confirm.
        return_dict = {'accessionNo': argDictionary['accessionNumber'],
                       'factor': comparisonDict['Factor'],
                       'comparisonNum': comparisonDict['Numerator'],
                       'comparisonDenom': comparisonDict['Denominator'],
                       'foldChange': getUrl(diffExpData['id']),
                       'interactome': pathwayDatamap['0']['id'],
                       'exonLength': pathwayDatamap['2']['id']}

        while getNumberComplete(history['id'], history_client) < numCompleted:
            time.sleep(10)

        for resultKey, datasetName in outputDatasets:
            return_dict[resultKey] = getUrl(getMostRecentDatasetByName(datasetName, history, history_client)['id'])
        return_collection.append(return_dict)

    outFileName = 'output/' + argDictionary['accessionNumber'] + '-workflowOutput.tsv'
    _writeWorkflowOutput(outFileName, keys, return_collection)

    return return_collection