Example #1
def get(dataset_id, history_id=None):
    """
        Given the history_id that is displayed to the user, this function will
        download the file from the history and stores it under /import/
        Return value is the path to the dataset stored under /import/
    """
    conf = _get_conf()
    gi = get_galaxy_connection()
    hc = HistoryClient(gi)
    dc = DatasetClient(gi)

    file_path = '/import/%s' % dataset_id
    history_id = history_id or _get_history_id()

    # Cache file requests: if someone does something silly like calling get()
    # for the same Galaxy file in a for-loop, we don't want to re-download it
    # every time and add that overhead.
    if not os.path.exists(file_path):
        dataset_mapping = dict([
            (dataset['hid'], dataset['id'])
            for dataset in hc.show_history(history_id, contents=True)
        ])
        try:
            hc.download_dataset(history_id,
                                dataset_mapping[dataset_id],
                                file_path,
                                use_default_filename=False,
                                to_ext=None)
        except Exception:
            dc.download_dataset(dataset_mapping[dataset_id],
                                file_path,
                                use_default_filename=False)

    return file_path
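
For orientation, a minimal usage sketch of get() (as in Example #18, the helper is importable from galaxy_ie_helpers; the hid value below is a placeholder):

# Hypothetical usage; get() caches the download under /import/ and returns its path.
from galaxy_ie_helpers import get

path = get(3)              # 3 is the hid shown in the Galaxy history panel
with open(path) as fh:
    print(fh.readline())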
Example #2
def get(dataset_id, history_id=None):
    """
        Given the history_id that is displayed to the user, this function will
        download the file from the history and stores it under /import/
        Return value is the path to the dataset stored under /import/
    """
    history_id = history_id or os.environ['HISTORY_ID']
    # The object-oriented bioblend interface is too slow at retrieving all
    # datasets from a history, so fall back to the non-object API.
    gi = get_galaxy_connection(history_id=history_id, obj=False)
    file_path = '/import/%s' % dataset_id
    log.debug('Downloading gx=%s history=%s dataset=%s', gi, history_id,
              dataset_id)
    # Cache file requests: if someone does something silly like calling get()
    # for the same Galaxy file in a for-loop, we don't want to re-download it
    # every time and add that overhead.
    if not os.path.exists(file_path):
        hc = HistoryClient(gi)
        dc = DatasetClient(gi)
        history = hc.show_history(history_id, contents=True)
        datasets = {ds['hid']: ds['id'] for ds in history}
        dc.download_dataset(datasets[dataset_id],
                            file_path=file_path,
                            use_default_filename=False)
    else:
        log.debug('Cached, not re-downloading')

    return file_path
Example #3
def create_clients(self):
    '''
    Create bioblend clients for the Galaxy instance.
    '''
    # Create the first client and check that the API works
    self.config_client = ConfigClient(self.instance)
    try:
        self.config_client.get_version()
        self.config_client.get_config()
    except Exception:
        logger.error("Provided API key does not work.")
        return False
    try:
        self.user_client = UserClient(self.instance)
        self.workflow_client = WorkflowClient(self.instance)
        self.tool_client = ToolClient(self.instance)
        self.toolshed_client = ToolShedClient(self.instance)
        self.library_client = LibraryClient(self.instance)
        self.roles_client = RolesClient(self.instance)
        self.history_client = HistoryClient(self.instance)
        self.dataset_client = DatasetClient(self.instance)
    except Exception:
        logger.error("Error initializing other bioblend clients.")
        return False
    return True
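
The surrounding class is not shown in this example, so the following is only a hedged sketch of how create_clients() might be wired up (the GalaxyConnector wrapper, URL, and key are assumptions):

# Hypothetical wrapper; only GalaxyInstance and the create_clients() method above
# come from the example, everything else is illustrative.
from bioblend.galaxy import GalaxyInstance

class GalaxyConnector:
    def __init__(self, url, api_key):
        self.instance = GalaxyInstance(url=url, key=api_key)

    create_clients = create_clients  # attach the function above as a method

connector = GalaxyConnector('https://galaxy.example.org', 'YOUR_API_KEY')
if connector.create_clients():
    print(connector.config_client.get_version())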
Example #4
def main():
    galaxyInstance = GalaxyInstance(url=GALAXY_URL, key=API_KEY)
    toolClient = ToolClient(galaxyInstance)
    histories = HistoryClient(galaxyInstance)
    workflowsClient = WorkflowClient(galaxyInstance)
    libraryClient = LibraryClient(galaxyInstance)

    brassica_library = libraryClient.get_libraries(
        name=' Evolutionary Systems Biology')
    files = libraryClient.show_library(brassica_library[0]['id'],
                                       contents=True)
    #print(files)
    itemp = 0
    for f in files:
        if f['type'] == 'folder':
            continue  # do nothing, try next
        #initial set
        #if itemp == 31:
        #	break

        #print ("Name " + f['name'])

        replicate = f['name'].split('_')[-1].split('.')[0]
        #print replicate
        if replicate == '1':
            itemp = itemp + 1
            if not (itemp >= 71 and itemp <= 92):
                continue
            base = f['name'].split('_')[:-1]
            #print base
            forward_name = f['name']
            reverse_name = '_'.join(base) + '_2.fastq.bz2'
            forward_id = f['id']
            files2 = libraryClient.show_library(brassica_library[0]['id'],
                                                contents=True)
            for f2 in files2:
                if f2['name'] == reverse_name:
                    reverse_id = f2['id']
            print(forward_name)
            print(reverse_name)
            new_history_name = f['name'].split('_')[7] + "_" + f['name'].split(
                '_')[-3] + "_" + f['name'].split('_')[-2]
            print(new_history_name)
            hist = histories.create_history(name=new_history_name)
            dataset_F = histories.upload_dataset_from_library(
                hist['id'], forward_id)
            dataset_R = histories.upload_dataset_from_library(
                hist['id'], reverse_id)
            datamap = {}
            datamap['0'] = {'src': 'hda', 'id': dataset_F['id']}
            datamap['1'] = {'src': 'hda', 'id': dataset_R['id']}
            workflows = workflowsClient.get_workflows(name="Maize HISAT 2.1")
            workflow = workflows[0]
            try:
                w = workflowsClient.run_workflow(workflow['id'],
                                                 datamap,
                                                 history_id=hist['id'])
            except Exception:
                print('Next')
Example #5
def check_histories(run, api_key, host, logger):
    galaxy_instance = GalaxyInstance(host, key=api_key)
    history_client = HistoryClient(galaxy_instance)
    history_json_d = run + '/output'
    histories = read_all_histories(history_json_d, logger)
    (all_successful, all_running, all_failed, all_except, all_waiting,
     upload_history) = get_history_status(histories, history_client, logger)
    return (all_successful, all_running, all_failed, all_except, all_waiting,
            upload_history)
def delete_galaxy_histories(pks, purge, user):
    hss = History.objects.filter(pk__in=pks)

    for hs in hss:
        git = hs.galaxyinstancetracking
        gi, gu = get_gi_gu(user, git)
        hc = HistoryClient(gi)
        hc.delete_history(hs.galaxy_id, purge)
        hs.delete()
def get_user_history(history_id=None):
    """
       Get all visible dataset infos of user history.
       Return a list of dict of each dataset.
    """
    history_id = history_id or os.environ['HISTORY_ID']
    gi = get_galaxy_connection(history_id=history_id, obj=False)
    hc = HistoryClient(gi)
    history = hc.show_history(history_id, visible=True, contents=True)
    return history
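
A short usage sketch; hid, name, and extension are standard keys of the dicts returned by show_history with contents=True:

# Hypothetical usage; HISTORY_ID is assumed to be set in the environment by Galaxy.
for ds in get_user_history():
    print(ds['hid'], ds['name'], ds.get('extension'))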
Example #8
def transfer_filelist_from_ftp(gi, filelist, history_name):

    tc = ToolClient(gi)
    hc = HistoryClient(gi)

    st = get_time_stamp()
    hist = hc.create_history('{}-{}'.format(history_name, st))

    uploaded_files = []
    for f in filelist:
        upf = tc.upload_from_ftp(path=os.path.basename(f),
                                 history_id=hist['id'])['outputs'][0]
        print(upf)
        uploaded_files.append(upf)
    return uploaded_files, hist
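
A hedged usage sketch for the FTP helper above (URL, API key, and file names are placeholders; the files must already be in the user's Galaxy FTP directory):

# Hypothetical usage of transfer_filelist_from_ftp().
from bioblend.galaxy import GalaxyInstance

gi = GalaxyInstance(url='https://galaxy.example.org', key='YOUR_API_KEY')
uploaded, hist = transfer_filelist_from_ftp(
    gi,
    ['sample_R1.fastq.gz', 'sample_R2.fastq.gz'],
    history_name='ftp-import')
print(hist['id'], [d['name'] for d in uploaded])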
def main():
    galaxyInstance = GalaxyInstance(url=GALAXY_URL, key=API_KEY)
    toolClient = ToolClient(galaxyInstance)
    histories = HistoryClient(galaxyInstance)
    workflowsClient = WorkflowClient(galaxyInstance)
    libraryClient = LibraryClient(galaxyInstance)

    brassica_library = libraryClient.get_libraries(
        name=' Evolutionary Systems Biology')
    files = libraryClient.show_library(brassica_library[0]['id'],
                                       contents=True)
    #print(files)

    for f in files:
        if f['type'] == 'folder':
            continue  # do nothing, try next
        #initial set
        #if itemp == 31:
        #	break

        #print ("Name " + f['name'])

        replicate = f['name'].split('/')[-1][0]
        #print replicate
        if replicate == 'X':

            base = f['name'].split('/')[-1].split('.')[0]
            #print base
            forward_name = f['name']
            forward_id = f['id']
            print(forward_name)

            new_history_name = base
            print(new_history_name)
            hist = histories.create_history(name=new_history_name)
            dataset_F = histories.upload_dataset_from_library(
                hist['id'], forward_id)
            datamap = {}
            datamap['0'] = {'src': 'hda', 'id': dataset_F['id']}
            workflows = workflowsClient.get_workflows(
                name="Maize Small samples HISAT 2.1")
            workflow = workflows[0]
            try:
                w = workflowsClient.run_workflow(workflow['id'],
                                                 datamap,
                                                 history_id=hist['id'])
            except Exception:
                print('Next')
def get_history_data(pk, user, name_filter=None, data_type=None):
    hs = History.objects.get(pk=pk)
    git = hs.galaxyinstancetracking
    gi, gu = get_gi_gu(user, git)
    hc = HistoryClient(gi)
    hdatasets = hc.show_matching_datasets(hs.galaxy_id)

    if data_type:
        hdatasets = [h for h in hdatasets if h['extension'] in data_type]

    if name_filter:
        hdatasets = [h for h in hdatasets if h['name'] in name_filter]

    for h in hdatasets:
        h['galaxy_instance'] = git.name
        h['galaxy_instance_id'] = git.pk
        h['history_internal_id'] = pk

    return hdatasets
Example #11
def get_workflow_status(user):
    # go through every galaxy instance
    gits = GalaxyInstanceTracking.objects.filter(
        galaxyuser__internal_user=user)
    dj_wfs = Workflow.objects.all()
    # loop through instances
    status = []
    for git in gits:
        ## loop through workflows for that instance
        gi, gu = get_gi_gu(user, git)
        wc = WorkflowClient(gi)
        hc = HistoryClient(gi)
        wfs = wc.get_workflows()
        for wf in wfs:
            wfd = wc.show_workflow(wf['id'])
            winvoke = wc.get_invocations(wf['id'])
            for wi in winvoke:
                wid = wc.show_invocation(wf['id'], wi['id'])
                h_l = hc.get_histories(wid['history_id'], deleted=True)

                if h_l:
                    h = h_l[0]
                else:
                    continue
                sd = get_status_d(wid)
                sd['name'] = wfd['name']
                hd = hc.show_history(h['id'])
                sd['history_name'] = h['name']
                datetime_object = datetime.strptime(hd['update_time'],
                                                    '%Y-%m-%dT%H:%M:%S.%f')
                # sd['history_url'] =  '{}{}'.format(git.url, hd['url'])

                sd['update_time'] = datetime_object.strftime(
                    '%Y-%m-%d %H:%M:%S')
                sd['update_time_unix'] = unixtime(datetime_object)
                sd['galaxy_instance'] = git.name
                status.append(sd)

    status = sorted(status, key=lambda k: k['update_time_unix'], reverse=True)

    return status
def main():
    galaxyInstance = GalaxyInstance(url=GALAXY_URL, key=API_KEY)
    toolClient = ToolClient(galaxyInstance)
    historyClient = HistoryClient(galaxyInstance)
    workflowsClient = WorkflowClient(galaxyInstance)
    libraryClient = LibraryClient(galaxyInstance)
    datasetClient = DatasetClient(galaxyInstance)

    histories = historyClient.get_histories(deleted=False)
    for hist in histories:
        hist_id = hist['id']
        countSecondary = historyClient.show_matching_datasets(
            hist_id, name_filter=name_filter)
        if len(countSecondary) != 0:
            #print(countSecondary)
            file_path = dir_name + '/' + hist['name'] + '_' + name_filter + '.' + ext
            #print(file_path)
            #print(countSecondary[0]['dataset_id'])
            datasetClient.download_dataset(countSecondary[0]['id'],
                                           file_path=file_path,
                                           use_default_filename=False)
    sys.exit()
def init_history_data_save_form(user, history_internal_id, galaxy_dataset_id):

    h = History.objects.get(pk=history_internal_id)

    gi, gu = get_gi_gu(user, h.galaxyinstancetracking)

    # save temp history object
    hc = HistoryClient(gi)

    history_d = hc.show_dataset(history_id=h.galaxy_id,
                                dataset_id=galaxy_dataset_id)

    history_d['full_download_url'] = h.galaxyinstancetracking.url + history_d[
        'download_url']

    history_d['abs_pth'] = ''

    data_pth = history_d['file_name'].replace('/export/', '')
    fullpth = os.path.join(h.galaxyinstancetracking.galaxy_root_path, data_pth)

    if os.path.exists(fullpth):
        history_d['abs_pth'] = fullpth
    print('ABS_PTH {}'.format(history_d['abs_pth']))
    return history_d
Example #14
def getGalaxyData(accession, dataType, species, foldChangeOnly):

    api_key = 'ENTER_API_KEY'
    galaxy_host = 'http://localhost:8080'

    gi = GalaxyInstance(url=galaxy_host, key=api_key)

    history_client = HistoryClient(gi)

    dataDirectory = "Sybil/Shiny/data/" + accession
    if not os.path.exists(dataDirectory):
        os.makedirs(dataDirectory)

    wwwDirectory = "Shiny/www/microarrayQC.html/" + accession
    if not os.path.exists(wwwDirectory):
        os.makedirs(wwwDirectory)

    wwwDirectoryPlots = "Shiny/www/plots/" + accession
    if not os.path.exists(wwwDirectoryPlots):
        os.makedirs(wwwDirectoryPlots)

    #get the most recent history
    history = history_client.get_histories(name=accession)[0]

    #get the experiment level data
    getPCA(history, history_client, dataDirectory, galaxy_host)
    getChrDirTable(history, history_client, dataDirectory, galaxy_host)
    if dataType == "Microarray":
        getQC(history, history_client, wwwDirectory, galaxy_host)
    comparisons = getComparisonsTable(history, history_client, dataDirectory,
                                      galaxy_host)

    number_of_comparisons = -1
    for line in open(comparisons):
        if not line.isspace():
            number_of_comparisons += 1

    if foldChangeOnly == "FALSE":
        pvalues = ["1", "0.05"]
        foldchanges = ["1", "1.5", "2"]
        thresholds = list(itertools.product(pvalues, foldchanges))
        thresholds.pop(0)
    else:
        pvalues = ["1"]
        foldchanges = ["1.5", "2"]
        thresholds = list(itertools.product(pvalues, foldchanges))

    for i in reversed(range(number_of_comparisons)):

        getFoldChange(i, history, history_client, dataDirectory, galaxy_host)

    for index, values in reversed(
            list(
                enumerate(
                    list(
                        itertools.product(range(number_of_comparisons),
                                          thresholds))))):

        (comparison, (pvalue, foldchange)) = values

        print(index)
        print(values)

        getStringNetworks(index, comparison, history, history_client,
                          dataDirectory, galaxy_host)
        getBioGridNetworks(index, comparison, history, history_client,
                           dataDirectory, galaxy_host)

        getPathways(index, comparison, pvalue, foldchange, history,
                    history_client, dataDirectory, galaxy_host)

        getDrugEnrichment(index, comparison, pvalue, foldchange, history,
                          history_client, dataDirectory, galaxy_host)
        getGOEnrichment(index, comparison, pvalue, foldchange, history,
                        history_client, dataDirectory, wwwDirectoryPlots,
                        galaxy_host)

        if species in ["Human", "Mouse"]:
            getTFs(index, comparison, pvalue, foldchange, history,
                   history_client, dataDirectory, galaxy_host)
def runWorkflow(argDictionary, comparisons):
    from bioblend.galaxy import GalaxyInstance
    from bioblend.galaxy.histories import HistoryClient
    from bioblend.galaxy.tools import ToolClient
    from bioblend.galaxy.workflows import WorkflowClient
    from bioblend.galaxy.libraries import LibraryClient
    import time
    
    api_key = ''
    galaxy_host = 'http://localhost:8080/'

    gi = GalaxyInstance(url=galaxy_host, key=api_key)

    history_client = HistoryClient(gi)
    tool_client = ToolClient(gi)
    workflow_client = WorkflowClient(gi)
    library_client = LibraryClient(gi)
    
    history = history_client.create_history(argDictionary['accessionNumber'])
    # Import the galaxy workflow
    workflow = workflow_client.show_workflow('a799d38679e985db')

    input_file = tool_client.upload_file(comparisons, history['id'], file_type='txt')

    # Run workflow on csv data to create a new history.
    params = dict()
    for key in workflow['steps'].keys():
        params[key] = argDictionary
    
    datamap = {'1' : {'id': input_file['outputs'][0]['id'], 'src': 'hda'}}

    workflow_client.invoke_workflow(workflow['id'], inputs = datamap, history_id = history['id'], params = params)
    
    # A dirty hack: wait until we have all datasets
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
        
    
    dataset_id = getFoldChangeData(history, history_client)['id']

    
    return_collection = [{
        'accessionNo': argDictionary['accessionNumber'],
        'foldChange': getUrl(dataset_id),
        'PCA': getUrl(getMostRecentDatasetByName('PCAplot.png', history, history_client)['id']),
        'chrDirTable': getUrl(getMostRecentDatasetByName('chrDirTable.tabular', history, history_client)['id']),
    }]
    
    number_of_comparisons = -1
    for line in open(comparisons):
        if not line.isspace():
            number_of_comparisons += 1

    for comparison in range(0, int(number_of_comparisons)):
        tool_inputs = {
            'foldChangeTable' : {'id': dataset_id, 'src': 'hda'},
            'comparisonNumber' : comparison + 1
        }
        tool_client.run_tool(history['id'], 'cutFoldChangeTable', tool_inputs)
        
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
        
    if argDictionary['species'] in ["Rat","Cow","Horse","Pig","Zebrafish"]:
        pathwayAnalysisWorkflow = workflow_client.show_workflow('c9468fdb6dc5c5f1')
        
        params = dict()
        for key in pathwayAnalysisWorkflow['steps'].keys():
            params[key] = argDictionary
        
        if argDictionary['species'] == "Rat":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="ratStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="ratGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.rat.txt")
        if argDictionary['species'] == "Cow":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="cowStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="cowGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.cow.txt")
        if argDictionary['species'] == "Horse":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="horseStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="horseGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.horse.txt")
        if argDictionary['species'] == "Pig":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="pigStringNetwork.txt")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="pigGeneLengths.tabular")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.pig.txt")
        if argDictionary['species'] == "Zebrafish":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="zebrafishStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="zebrafishGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="HOM_AllOrganism.rpt")
        
                
        pathwayDatamap = {'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}

        diffExpDataCollection = getDatasetsByName('cutTable.tabular', history, history_client)
        for index, diffExpData in enumerate(diffExpDataCollection):
            
            numCompleted = getNumberComplete(history['id'], history_client) + 10
            print(numCompleted)
            
            pathwayDatamap["0"] = {'id': diffExpData['id'], 'src': 'hda'}
            workflow_client.invoke_workflow(pathwayAnalysisWorkflow['id'], 
                                            inputs = pathwayDatamap, 
                                            history_id = history['id'], 
                                            params = params)                  
            
            
            comparisonDict = getRowFromCsv(comparisons, index)
            
            if 'Factor1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Factor1'] + "." + comparisonDict['Factor2']
                
            if 'Paired1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Paired1']
                
            return_dict = {'accessionNo':argDictionary['accessionNumber'],
                           'factor':comparisonDict['Factor'],
                           'comparisonNum':comparisonDict['Numerator'],
                           'comparisonDenom':comparisonDict['Denominator'],
                           'foldChange': getUrl(diffExpData['id']),
                           'interactome': pathwayDatamap['0']['id'],
                           'exonLength': pathwayDatamap['2']['id']}
            
            while getNumberComplete(history['id'], history_client) < numCompleted:
                time.sleep(10)
    
            return_dict['moduleNodes'] = getUrl(getMostRecentDatasetByName('moduleNodes.text', 
                history, history_client)['id'])
            return_dict['modulePlots'] = getUrl(getMostRecentDatasetByName('modulePlots.pdf',
            history, history_client)['id'])
            return_dict['slimEnrichmentPathways'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPathways.tabular',
            history, history_client)['id'])
            return_dict['slimEnrichmentPlot'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPlot.png',
            history, history_client)['id'])
            return_collection.append(return_dict)     
       
        # Hard code keys to define the order
        keys = ['accessionNo','factor','comparisonNum','comparisonDenom','PCA','chrDirTable','foldChange',
        'interactome','exonLength','moduleNodes','modulePlots','enrichmentTable','slimEnrichmentPathways','slimEnrichmentPlot']
        with open('output/' + argDictionary['accessionNumber'] + '-workflowOutput.csv', 'w', newline='') as csvFile:
            # Get headers from last dictionary in collection as first doesn't contain all keys
            csvOutput = csv.DictWriter(csvFile, keys)
            csvOutput.writeheader()
            csvOutput.writerows(return_collection)
            
        return return_collection
    else: 
        pathwayAnalysisWorkflow = workflow_client.show_workflow('e85a3be143d5905b')
        
        params = dict()
        for key in pathwayAnalysisWorkflow['steps'].keys():
            params[key] = argDictionary
            
        # MouseGeneLengths.tab has id 457f69dd7016f307 - step 2 of workflow
        # Mouse interactome has id 073be90ac6c3bce5 - step 0 of workflow
        
        if argDictionary['species'] == "Mouse":  
    
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="mouseStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="MouseGeneLengths.tab")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.mouse.txt")
            secretedReference=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="uniprot-secreted-mouse.txt")
            
            pathwayDatamap = {'4' : {'id':  secretedReference, 'src': 'hda'},'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}
        else:
        
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="humanStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="geneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.mouse.txt")
            secretedReference=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="uniprot-secreted-human.txt")
            pathwayDatamap = {'4' : {'id':  secretedReference, 'src': 'hda'},'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}
    
        diffExpDataCollection = getDatasetsByName('cutTable.tabular', history, history_client)
        for index, diffExpData in enumerate(diffExpDataCollection):
            
            numCompleted = getNumberComplete(history['id'], history_client) + 14
            print(numCompleted)
            
            pathwayDatamap["0"] = {'id': diffExpData['id'], 'src': 'hda'}

            workflow_client.invoke_workflow(pathwayAnalysisWorkflow['id'], 
                                            inputs = pathwayDatamap, 
                                            history_id = history['id'], 
                                            params = params)                  
            
            
            comparisonDict = getRowFromCsv(comparisons, index)
            
            if 'Factor1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Factor1'] + "." + comparisonDict['Factor2']
                
            if 'Paired1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Paired1']
                
            return_dict = {'accessionNo':argDictionary['accessionNumber'],
                           'factor':comparisonDict['Factor'],
                           'comparisonNum':comparisonDict['Numerator'],
                           'comparisonDenom':comparisonDict['Denominator'],
                           'foldChange': getUrl(diffExpData['id']),
                           'interactome': pathwayDatamap['0']['id'],
                           'exonLength': pathwayDatamap['2']['id']}
            
            while getNumberComplete(history['id'], history_client) < numCompleted:
                time.sleep(10)
    
            return_dict['moduleNodes'] = getUrl(getMostRecentDatasetByName('moduleNodes.text', 
                history, history_client)['id'])
            return_dict['modulePlots'] = getUrl(getMostRecentDatasetByName('modulePlots.pdf',
            history, history_client)['id'])
            return_dict['pathways'] = getUrl(getMostRecentDatasetByName('pathways.tabular', 
                history, history_client)['id'])
            return_dict['enrichPlot'] = getUrl(getMostRecentDatasetByName('enrichmentPlot.png', 
                history, history_client)['id'])
            return_dict['enrichmentTable'] = getUrl(getMostRecentDatasetByName('TF_EnrichmentTable.tabular', 
                history, history_client)['id'])
            return_dict['slimEnrichmentPathways'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPathways.tabular',
            history, history_client)['id'])
            return_dict['slimEnrichmentPlot'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPlot.png',
            history, history_client)['id'])
            return_collection.append(return_dict)     
       
        # Hard code keys to define the order
        keys = ['accessionNo','factor','comparisonNum','comparisonDenom','PCA','chrDirTable','foldChange',
        'interactome','exonLength','moduleNodes','modulePlots','pathways','enrichPlot','enrichmentTable','slimEnrichmentPathways','slimEnrichmentPlot']
        with open('output/' + argDictionary['accessionNumber'] + '-workflowOutput.csv', 'w', newline='') as csvFile:
            # Get headers from last dictionary in collection as first doesn't contain all keys
            csvOutput = csv.DictWriter(csvFile, keys)
            csvOutput.writeheader()
            csvOutput.writerows(return_collection)
            
        return return_collection
Example #16
def get_workflow_inputs(l, pkd, gi, git, history_name, library):
    # LibraryDatasetDatasetAssociation (ldda), LibraryDataset (ld), HistoryDatasetAssociation (hda),
    # or HistoryDatasetCollectionAssociation (hdca).
    st = get_time_stamp()

    hc = HistoryClient(gi)
    worklow_inputs_d = {}

    for table, filter, dinput_name, dinput_step, dinput_type in l:
        pks = pkd[str(table.prefix)]

        #  will get multiple inputs here because we can multiple galaxyfilelinks per file. They are all the same
        # file so we can just get unique
        selected_objects = GenericFile.objects.filter(pk__in=pks).distinct()

        print('PKS', pks, dinput_type)
        print(selected_objects)

        if dinput_type == 'data_input':

            # can only use the first selection (need to use data collection for multiple files, currently this
            # approach doesn't support using 'multiple files' as input as not possible with BioBlend (i think)
            s = selected_objects[0]
            gid = s.galaxyfilelink_set.filter(
                galaxy_library=library)[0].galaxy_id

            print(gid)

            worklow_inputs_d[dinput_step] = {'id': gid, 'src': 'ld'}

        elif dinput_type == 'data_collection_input':

            element_identifiers = []
            hist = hc.create_history('{}-(data-history-{})-{}'.format(
                history_name, dinput_name, st))

            for s in selected_objects:
                print(s)
                gfl = s.galaxyfilelink_set.filter(galaxy_library=library)[0]

                if library:
                    dataset = hc.upload_dataset_from_library(
                        hist['id'], lib_dataset_id=gfl.galaxy_id)
                    element_identifiers.append({
                        'id':
                        dataset['id'],
                        'name':
                        os.path.basename(dataset['file_name']),
                        'src':
                        'hda'
                    })
                else:
                    element_identifiers.append({
                        'id':
                        gfl.galaxy_id,
                        'name':
                        gfl.genericfile.data_file.name,
                        'src':
                        'hda'
                    })

            c_descript = {
                'collection_type': 'list',
                'element_identifiers': element_identifiers,
                'name': dinput_name,
            }

            dc = hc.create_dataset_collection(hist['id'], c_descript)
            worklow_inputs_d[dinput_step] = {'id': dc['id'], 'src': 'hdca'}

    return worklow_inputs_d
Example #17
def runWorkflow(argDictionary, comparisons,samples):
    from bioblend.galaxy import GalaxyInstance
    from bioblend.galaxy.histories import HistoryClient
    from bioblend.galaxy.tools import ToolClient
    from bioblend.galaxy.workflows import WorkflowClient
    from bioblend.galaxy.libraries import LibraryClient
    import tempfile
    
    
    import time
    api_key = ''
    galaxy_host = ''

    gi = GalaxyInstance(url=galaxy_host, key=api_key)

    history_client = HistoryClient(gi)
    tool_client = ToolClient(gi)
    workflow_client = WorkflowClient(gi)
    library_client = LibraryClient(gi)
    
    history = history_client.create_history(argDictionary['accessionNumber'])
    
    comparisonsTable = tool_client.upload_file(comparisons, history['id'], file_type='txt')
    sampleTable = tool_client.upload_file(samples, history['id'], file_type='tabular')
    
    if argDictionary['site'] == "ENA":
        #fastqs available on ENA    
        tool_inputs = {
                "accessionNumber":argDictionary["ENA"],"sampleTable":{'id': sampleTable['outputs'][0]['id'], 'src': 'hda'}
                
            }
        
    
        #run the tool to get the data from ENA
        tool_client.run_tool(history['id'],'getRNASeqExpressionData', tool_inputs)
        
        #we want to wait until we have all datasets
        while getNumberNotComplete(history['id'], history_client) > 0:
            time.sleep(10)
            
        
        #sleep until all the fastq files are findable
        time.sleep(120)
        
        
        dirpath = tempfile.mkdtemp()
        fileList = getDatasetsByApproxName("files.tabular", history,history_client)[0]
        fileList = history_client.download_dataset(history["id"],fileList["id"],dirpath)
        num_lines = sum(1 for line in open(fileList)) -1
        
        datasets = list()
        while len(datasets) != num_lines:
            time.sleep(10)
            datasets = getDatasetsByApproxName("fastq", history, history_client)
    else: #for SRA       
    
        if argDictionary['single'] == "TRUE":
            with open(samples) as tsvfile:
                reader = csv.DictReader(tsvfile, delimiter='\t')
                for sample in reader:
                    print (sample)
                    fileNames=str.split(sample["File"],"|")
                    for fileName in fileNames:                    
                        tool_inputs = {
                                "input|input_select":"accession_number",
                                "outputformat":"fastqsanger.gz",
                                "input|accession":fileName   
                            }
                        #run the tool to get the single data from SRA
                        tool_client.run_tool(history['id'],'toolshed.g2.bx.psu.edu/repos/iuc/sra_tools/fastq_dump/2.8.1.3', tool_inputs)
               
        else:
            with open(samples) as tsvfile:
                reader = csv.DictReader(tsvfile, delimiter='\t')

                for sample in reader:
                    tool_inputs = {
                            "accession_number":sample["File"]           
                        }
                    #run the tool to get the paired data from SRA
                    tool_client.run_tool(history['id'],'toolshed.g2.bx.psu.edu/repos/mandorodriguez/fastqdump_paired/fastq_dump_paired/1.1.4', tool_inputs)
                
        while getNumberNotComplete(history['id'], history_client) > 0:
            time.sleep(10)
     
    datasets = getDatasetsByApproxName("fastq",history,history_client )
    #get the fastQC tool
    for fastq in datasets:
        try:
            tool_inputs = {'input_file' : {'id': fastq['id'], 'src': 'hda'}}
            tool_client.run_tool(history['id'],'toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.69', tool_inputs)
        except Exception:
            pass
        
    #wait till complete
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
    
    #make dataset collections for quantification using the fastq files
    collections=list()
    with open(samples) as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            datasets=list()
            fileNames=str.split(row["File"],"|")
            
            for fileName in fileNames:
                datasets= datasets + getDatasetsByApproxName(fileName,history,history_client )
                    
            #make list of datasets
            collections.append(makeDataSetCollection(datasets,row["Sample"],history,history_client))
            
            
            
    #get the correct kallisto index
    species = argDictionary['species'].lower()
    index = getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name=species +"IndexFile")
    index = {'id': index, 'src': 'hda'}
    
    #run kallisto for every dataset collection
    for collection in collections:
        #set up the tool_inputs
        tool_inputs = {'index' : index,'inputs' : {'id': collection['id'], 'src': 'hdca'} ,"single":argDictionary["single"],"stranded":argDictionary["stranded"]}
        
        
        #often encounter connection broken error - possible problem with Certus server?
        #bypass by ignoring the exception
        tool_client.run_tool(history['id'],'kallistoQuant', tool_inputs)


    # we want to wait until we have all datasets
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
        
    # Run multiqc on kallisto logs and fastqc files
    datasets = getDatasetsByApproxName("RawData",history,history_client )
    kallistoLogs = getDatasetsByApproxName(".log", history, history_client)
    
    tool_inputs = {}
    for i, dataset in enumerate(datasets+kallistoLogs):
        if not dataset["deleted"]:
            if dataset in datasets:
                software = 'fastqc'
            else:
                software = 'kallisto'
            params = {'id' : dataset['id'], 'src': 'hda', 'name': dataset['name']}
            tool_inputs.update({'results_%s|software_cond|software' % i: software, 'results_%s|input_file' % i: params})

    # Summarise with the multiQC tool
    tool_client.run_tool(history['id'],'multiqc', tool_inputs)
    
    multiQc = getDatasetsByApproxName("multiqc",history,history_client)[0]
    
        
    #get all the abundance files to convert to gene level counts matrix
    datasets = getDatasetsByApproxName(".abundance",history,history_client )
    
    #make a dataset collection for to make a countsMatrix
    collection = makeDataSetCollection(datasets,"abundances",history,history_client)
    
    
    #set up the tool_inputs
    tool_inputs = {'inputs' : {'id': collection['id'], 'src': 'hdca'} ,"species":argDictionary['species']}
    
    #convert abundances to gene level counts matrix
    tool_client.run_tool(history['id'],'KallistoAbundancestoGeneCountMatrix', tool_inputs)
    
    # A dirty hack: wait until we have all datasets
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
    
    txi = getDatasetsByApproxName("txi",history,history_client)
    

    #set up the tool_inputs for PCA
    tool_inputs = {'txiData' : {'id': txi[0]['id'], 'src': 'hda'} ,'sampleTable' : {'id': sampleTable['outputs'][0]['id'], 'src': 'hda'} ,"species":argDictionary['species'],'technicalReplicates':argDictionary['technicalReplicates'],'batchCorrect':argDictionary['batchCorrect']}
    
    #run deseq2
    tool_client.run_tool(history['id'],'PCARNASeq', tool_inputs)
    
    pca = getDatasetsByApproxName("PCA",history,history_client)[0]
    
       
    #set up the tool_inputs for DESeq2
    tool_inputs = {'txiData' : {'id': txi[0]['id'], 'src': 'hda'} ,'sampleTable' : {'id': sampleTable['outputs'][0]['id'], 'src': 'hda'} ,
    'comparisonsTable' : {'id': comparisonsTable['outputs'][0]['id'], 'src': 'hda'} ,"foldChangeOnly":argDictionary['foldChangeOnly'],"species":argDictionary['species'],'technicalReplicates':argDictionary['technicalReplicates'],'batchCorrect':argDictionary['batchCorrect']}
    
    #run deseq2
    tool_client.run_tool(history['id'],'DESeq2FoldChange', tool_inputs)
         
    #run chrdir
    tool_client.run_tool(history['id'],'characteristicDirectionRNASeq', tool_inputs)
    
        #we want to wait until we have all datasets
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
        
        
    #get the foldchange data, cut and run pathway workflow    
    dataset_id = getFoldChangeData(history, history_client)['id']
    
    
    return_collection = [{'accessionNo':argDictionary['accessionNumber'], 'foldChange': getUrl(dataset_id), 'PCA': getUrl(pca["id"]),'chrDirTable': getUrl(getMostRecentDatasetByName('chrDirTable.tabular', history, history_client)['id'])}]
    
    
    number_of_comparisons = -1
    for line in open(comparisons):
        if not line.isspace():
            number_of_comparisons += 1

    for comparison in range(0, int(number_of_comparisons)):
        tool_inputs = {
            'foldChangeTable' : {'id': dataset_id, 'src': 'hda'},
            'comparisonNumber' : comparison + 1
        }
        tool_client.run_tool(history['id'], 'cutFoldChangeTable', tool_inputs)
        
    while getNumberNotComplete(history['id'], history_client) > 0:
        time.sleep(10)
        
        
    if argDictionary['species'] in ["Rat","Cow","Horse","Pig","Zebrafish"]:
        pathwayAnalysisWorkflow = workflow_client.show_workflow('c9468fdb6dc5c5f1')
        
        params = dict()
        for key in pathwayAnalysisWorkflow['steps'].keys():
            params[key] = argDictionary
        
        if argDictionary['species'] == "Rat":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="ratStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="ratGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="HOM_AllOrganism.rpt")
        if argDictionary['species'] == "Cow":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="cowStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="cowGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="HOM_AllOrganism.rpt")
        if argDictionary['species'] == "Horse":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="horseStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="horseGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.horse.txt")
        if argDictionary['species'] == "Pig":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="pigStringNetwork.txt")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="pigGeneLengths.tabular")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.pig.txt")
        if argDictionary['species'] == "Zebrafish":
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="zebrafishStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="zebrafishGeneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="HOM_AllOrganism.rpt")
        
                
        pathwayDatamap = {'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}

        diffExpDataCollection = getDatasetsByName('cutTable.tabular', history, history_client)
        for index, diffExpData in enumerate(diffExpDataCollection):
            
            numCompleted = getNumberComplete(history['id'], history_client) + 10
            print(numCompleted)
            
            pathwayDatamap["0"] = {'id': diffExpData['id'], 'src': 'hda'}
            workflow_client.invoke_workflow(pathwayAnalysisWorkflow['id'], 
                                            inputs = pathwayDatamap, 
                                            history_id = history['id'], 
                                            params = params)                  
            
            
            comparisonDict = getRowFromCsv(comparisons, index)
            
            if 'Factor1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Factor1'] + "." + comparisonDict['Factor2']
                
            return_dict = {'accessionNo':argDictionary['accessionNumber'],
                           'factor':comparisonDict['Factor'],
                           'comparisonNum':comparisonDict['Numerator'],
                           'comparisonDenom':comparisonDict['Denominator'],
                           'foldChange': getUrl(diffExpData['id']),
                           'interactome': pathwayDatamap['0']['id'],
                           'exonLength': pathwayDatamap['2']['id']}
            
            while getNumberComplete(history['id'], history_client) < numCompleted:
                time.sleep(10)
    
            return_dict['moduleNodes'] = getUrl(getMostRecentDatasetByName('moduleNodes.text', 
                history, history_client)['id'])
            return_dict['modulePlots'] = getUrl(getMostRecentDatasetByName('modulePlots.pdf',
                history, history_client)['id'])
            return_dict['slimEnrichPathways'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPathways.tabular',
                history, history_client)['id'])
            return_dict['enrichedDrugsReverse'] = getUrl(getMostRecentDatasetByName('enrichedDrugsReverse.tabular',
                history, history_client)['id'])
            return_dict['enrichedDrugsMimic'] = getUrl(getMostRecentDatasetByName('enrichedDrugsMimic.tabular',
                history, history_client)['id'])
            return_dict['enrichedTerms'] = getUrl(getMostRecentDatasetByName('enrichedTerms.tabular',
                history, history_client)['id'])
            return_dict['enrichedTerms.reduced'] = getUrl(getMostRecentDatasetByName('enrichedTerms.reduced.tabular',
                history, history_client)['id'])
            return_dict['GO.MDS'] = getUrl(getMostRecentDatasetByName('GO.MDS.html',
                history, history_client)['id'])
            return_collection.append(return_dict)
       
        # Hard code keys to define the order
        keys = ['accessionNo','multiQC','factor','PCA','chrDirTable','comparisonNum','comparisonDenom','foldChange',
        'interactome','exonLength','moduleNodes','modulePlots',
        'slimEnrichPathways','secretedProteins','enrichedDrugsReverse','enrichedDrugsMimic','enrichedTerms','enrichedTerms.reduced','GO.MDS']
        
        outFileName = 'output/' +  argDictionary['accessionNumber'] + '-workflowOutput.tsv'
        
        with open(outFileName, 'w', newline='') as csvFile:
            # Get headers from last dictionary in collection as first doesn't contain all keys
            csvOutput = csv.DictWriter(csvFile, keys, delimiter = "\t")
            csvOutput.writeheader()
            csvOutput.writerows(return_collection)
            
        #tool_client.upload_file(outFileName, history['id'], file_type='tsv')
        
        return return_collection
    else:  
        pathwayAnalysisWorkflow = workflow_client.show_workflow('e85a3be143d5905b')
        
        params = dict()
        for key in pathwayAnalysisWorkflow['steps'].keys():
            params[key] = argDictionary
            
       
        if argDictionary['species'] == "Mouse":  
        
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="mouseStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="MouseGeneLengths.tab")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.mouse.txt")
            secretedReference=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="uniprot-secreted-mouse.txt")
            
            pathwayDatamap = {'4' : {'id':  secretedReference, 'src': 'hda'},'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}
        else:
        
            network=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="humanStringNetwork")
            geneLengths=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="geneLengths")
            homology=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="Homology.mouse.txt")
            secretedReference=getLibraryToolDataID(history=history,history_client=history_client,library_client=library_client,name="uniprot-secreted-human.txt")
            pathwayDatamap = {'4' : {'id':  secretedReference, 'src': 'hda'},'3' : {'id': homology, 'src': 'hda'},'2' : {'id': network, 'src': 'hda'},'1' : {'id': geneLengths, 'src': 'hda'}}
    
        diffExpDataCollection = getDatasetsByName('cutTable.tabular', history, history_client)
        for index, diffExpData in enumerate(diffExpDataCollection):
            
            numCompleted = getNumberComplete(history['id'], history_client) + 14
            print(numCompleted)
            
            pathwayDatamap["0"] = {'id': diffExpData['id'], 'src': 'hda'}

    
        
            #pathwayDatamap['1'] = {'id': diffExpData['id'], 'src': 'hda'}
            workflow_client.invoke_workflow(pathwayAnalysisWorkflow['id'], 
                                            inputs = pathwayDatamap, 
                                            history_id = history['id'], 
                                            params = params)
            comparisonDict = getRowFromCsv(comparisons, index)
            
            if 'Factor1' in comparisonDict.keys():
                comparisonDict['Factor'] = comparisonDict['Factor1'] + "." + comparisonDict['Factor2']
                
            return_dict = {'accessionNo':argDictionary['accessionNumber'],
                           'factor':comparisonDict['Factor'],
                           'comparisonNum':comparisonDict['Numerator'],
                           'comparisonDenom':comparisonDict['Denominator'],
                           'foldChange': getUrl(diffExpData['id']),
                           'interactome': pathwayDatamap['0']['id'],
                           'exonLength': pathwayDatamap['2']['id']}
            
            while getNumberComplete(history['id'], history_client) < numCompleted:
                time.sleep(10)
    
            return_dict['moduleNodes'] = getUrl(getMostRecentDatasetByName('moduleNodes.text', 
                history, history_client)['id'])
            return_dict['modulePlots'] = getUrl(getMostRecentDatasetByName('modulePlots.pdf',
                history, history_client)['id'])
            return_dict['pathways'] = getUrl(getMostRecentDatasetByName('pathways.tabular', 
                history, history_client)['id'])
            return_dict['enrichPlot'] = getUrl(getMostRecentDatasetByName('enrichmentPlot.png', 
                history, history_client)['id'])
            return_dict['enrichmentTable'] = getUrl(getMostRecentDatasetByName('TF_EnrichmentTable.tabular', 
                history, history_client)['id'])
            return_dict['slimEnrichPathways'] = getUrl(getMostRecentDatasetByName('slimEnrichmentPathways.tabular',
                history, history_client)['id'])
            return_dict['secretedProteins'] = getUrl(getMostRecentDatasetByName('secretedProteins.tabular',
                history, history_client)['id'])
            return_dict['enrichedDrugsReverse'] = getUrl(getMostRecentDatasetByName('enrichedDrugsReverse.tabular',
                history, history_client)['id'])
            return_dict['enrichedDrugsMimic'] = getUrl(getMostRecentDatasetByName('enrichedDrugsMimic.tabular',
                history, history_client)['id'])
            return_dict['enrichedTerms'] = getUrl(getMostRecentDatasetByName('enrichedTerms.tabular',
                history, history_client)['id'])
            return_dict['enrichedTerms.reduced'] = getUrl(getMostRecentDatasetByName('enrichedTerms.reduced.tabular',
                history, history_client)['id'])
            return_dict['GO.MDS'] = getUrl(getMostRecentDatasetByName('GO.MDS.html',
                history, history_client)['id'])
            return_collection.append(return_dict)
       
        # Hard code keys to define the order
        keys = ['accessionNo','multiQC','factor','PCA','chrDirTable','comparisonNum','comparisonDenom','foldChange',
        'interactome','exonLength','moduleNodes','modulePlots','pathways','enrichPlot', 'enrichmentTable',
        'slimEnrichPathways','secretedProteins','enrichedDrugsReverse','enrichedDrugsMimic','enrichedTerms','enrichedTerms.reduced','GO.MDS']
        
        outFileName = 'output/' +  argDictionary['accessionNumber'] + '-workflowOutput.tsv'
        
        with open(outFileName, 'w', newline='') as csvFile:
            # Get headers from last dictionary in collection as first doesn't contain all keys
            csvOutput = csv.DictWriter(csvFile, keys, delimiter = "\t")
            csvOutput.writeheader()
            csvOutput.writerows(return_collection)
            
        
        return return_collection
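
For reference, a hedged sketch of the argDictionary this RNA-seq variant of runWorkflow() expects; every key below is read somewhere in the function, but the example values are placeholders:

# Hypothetical invocation; comparisons and samples are paths to local TSV files.
argDictionary = {
    'accessionNumber': 'GSE00000',
    'site': 'ENA',              # anything else triggers the SRA branch
    'ENA': 'PRJEB00000',
    'single': 'FALSE',
    'stranded': 'FALSE',
    'species': 'Mouse',
    'foldChangeOnly': 'FALSE',
    'technicalReplicates': 'FALSE',
    'batchCorrect': 'FALSE',
}
results = runWorkflow(argDictionary, 'comparisons.tsv', 'samples.tsv')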
Example #18
#!/usr/bin/env python
import os
import shutil

import galaxy_ie_helpers

from bioblend.galaxy.histories import HistoryClient

hid = os.environ.get('DATASET_HID', None)
history_id = os.environ['HISTORY_ID']
if hid not in ('None', None):
    galaxy_ie_helpers.get(int(hid))
    shutil.copy('/import/%s' % hid, '/import/ipython_galaxy_notebook.ipynb')

additional_ids = os.environ.get("ADDITIONAL_IDS", "")
if additional_ids:
    gi = galaxy_ie_helpers.get_galaxy_connection(history_id=history_id,
                                                 obj=False)
    hc = HistoryClient(gi)
    history = hc.show_history(history_id, contents=True)
    additional_ids = additional_ids.split(",")
    for hda in history:
        if hda["id"] in additional_ids:
            galaxy_ie_helpers.get(int(hda["hid"]))
def main():
    parser = OptionParser()
    parser.add_option("-A",
                      "--auth-file",
                      dest="auth_filename",
                      help="JSON file with Galaxy host and key",
                      metavar="FILE")
    parser.add_option(
        "-f",
        "--uuid-file",
        dest="uuids_filename",
        help=
        "TSV file with list of UUIDs to import. The first row is assumed to be a header",
        metavar="FILE")
    parser.add_option(
        "-H",
        "--target-history",
        dest="target_history",
        help="Target history name in Galaxy to copy datasets into",
        metavar="HISTORY_NAME")
    (options, args) = parser.parse_args()
    if (not options.auth_filename):
        print_error_and_exit('Authentication file not provided')
    #if(not options.uuids_filename):
    #print_error_and_exit('TSV file with UUIDs not provided');
    if (not options.target_history):
        print_error_and_exit(
            'Galaxy history name where datasets will be imported not provided')

    #Read authentication info
    galaxy_host, galaxy_key = parse_auth_file(options.auth_filename)

    gi = GalaxyInstance(url=galaxy_host, key=galaxy_key)
    history_client = HistoryClient(gi)
    library_client = LibraryClient(gi)
    folder_client = FoldersClient(gi)

    #Read UUIDs file
    if (options.uuids_filename):
        try:
            uuids_fd = open(options.uuids_filename, 'rb')
        except IOError:
            print_error_and_exit('Could not open TSV file with UUIDs ' +
                                 options.uuids_filename)
    else:
        uuids_fd = sys.stdin
    queried_ds_uuid_dict = parse_TSV_file(uuids_fd)

    #Search for datasets
    find_datasets_by_uuids_in_histories(gi, history_client,
                                        queried_ds_uuid_dict)
    find_datasets_by_uuids_in_libraries(gi, library_client,
                                        queried_ds_uuid_dict)

    dataset_info_list = queried_ds_uuid_dict.values()
    #Validate datasets, discard repeats
    validate_queried_dataset_info(dataset_info_list)

    #Get/create target history
    target_history_id = get_or_create_history_id(gi, history_client,
                                                 options.target_history)
    #Copy datasets from library to history
    copy_from_lib(gi,
                  history_client,
                  dataset_info_list,
                  target_history_id=target_history_id)
    #Copy from history to /tmp and back - don't use anymore
    #copy_to_tmp_lib_and_back(gi, library_client, history_client, folder_client, '/tmp', dataset_info_list, target_history_id=target_history_id);
    #Copy history datasets from other histories
    copy_other_history_datasets(gi,
                                history_client,
                                dataset_info_list,
                                target_history_id=target_history_id)
    #Create dataset collections
    create_dataset_collections(gi,
                               history_client,
                               dataset_info_list,
                               target_history_id=target_history_id)
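A hedged invocation sketch for the importer above. The script and file names and the history name are placeholders, and the auth-file layout is only an assumption about what parse_auth_file() (not shown here) expects, namely the Galaxy URL and API key:

#   python import_by_uuid.py -A galaxy_auth.json -f dataset_uuids.tsv -H "Imported datasets"
#
# galaxy_auth.json (assumed layout): {"galaxy_host": "https://usegalaxy.org", "galaxy_key": "<API key>"}
# dataset_uuids.tsv: a TSV listing the dataset UUIDs; the first row is treated as a header.
# If -f/--uuid-file is omitted, the UUID list is read from stdin.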
def get_history_status(user, hist_id=None):
    # go through every galaxy instance
    gits = GalaxyInstanceTracking.objects.filter(
        galaxyuser__internal_user=user)

    # loop through instances
    status = []
    for git in gits:
        ## loop through histories for that instance
        gi, gu = get_gi_gu(user, git)
        hc = HistoryClient(gi)
        hists = hc.get_histories()

        # loop through and create a list of dictionaries for our django table
        for hist in hists:

            sd = {}
            # add useful info
            if hist_id and hist['id'] != hist_id:
                continue

            history_info = hc.show_history(hist['id'])

            # add status info
            sd_bioblend = hc.get_status(hist['id'])
            state_details = sd_bioblend['state_details']
            sd.update(state_details)

            sd['estimated_progress'] = sd_bioblend['percent_complete']
            datetime_object = datetime.strptime(history_info['update_time'],
                                                '%Y-%m-%dT%H:%M:%S.%f')
            sd['update_time'] = datetime_object.strftime('%Y-%m-%d %H:%M:%S')
            sd['update_time_unix'] = unixtime(datetime_object)
            sd['galaxy_instance'] = git.name

            sd['name'] = hist['name']

            hsq = History.objects.filter(galaxy_id=hist['id'],
                                         galaxyinstancetracking=git)

            if hsq:

                hs = hsq[0]
                hs.name = hist['name']
                hs.update_time = datetime_object.strftime('%Y-%m-%d %H:%M:%S')
                hs.empty = state_details['empty']
                hs.error = state_details['error']
                hs.failed_metadata = state_details['failed_metadata']
                hs.new = state_details['new']
                hs.ok = state_details['ok']
                hs.paused = state_details['paused']
                hs.running = state_details['running']
                hs.queued = state_details['queued']
                hs.setting_metadata = state_details['setting_metadata']
                hs.upload = state_details['upload']
                hs.estimated_progress = sd_bioblend['percent_complete']
            else:
                hs = History(
                    galaxyinstancetracking=git,
                    name=hist['name'],
                    update_time=datetime_object.strftime('%Y-%m-%d %H:%M:%S'),
                    empty=state_details['empty'],
                    error=state_details['error'],
                    failed_metadata=state_details['failed_metadata'],
                    new=state_details['new'],
                    ok=state_details['ok'],
                    paused=state_details['paused'],
                    running=state_details['running'],
                    queued=state_details['queued'],
                    setting_metadata=state_details['setting_metadata'],
                    upload=state_details['upload'],
                    galaxy_id=hist['id'],
                    estimated_progress=sd_bioblend['percent_complete'])

            hs.save()
            sd['history_data_bioblend_list'] = '/galaxy/history_data_bioblend_list/{}'.format(
                hs.pk)
            status.append(sd)

    status = sorted(status, key=lambda k: k['update_time_unix'], reverse=True)

    return status
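A hypothetical usage sketch for the status helper above; the wrapper name and the idea of printing a summary are illustrative only, but the dictionary keys are the ones populated in the function:

def print_history_summary(user):
    # Rows come back sorted newest-first; one entry per Galaxy history.
    for row in get_history_status(user):
        print(row['galaxy_instance'], row['name'],
              row['estimated_progress'], row['update_time'])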
def get(datasets_identifiers,
        identifier_type='hid',
        history_id=None,
        retrieve_datatype=None):
    """
        Given the history_id that is displayed to the user, this function will
        first resolve the identifiers (when identifier_type is set to 'regex',
        matching datasets are looked up in the history) and then download the
        file[s] from the history and store them under /import/.
        Return value[s] are the path[s] to the dataset[s] stored under /import/
    """
    history_id = history_id or os.environ['HISTORY_ID']
    # The object version of bioblend is too slow at retrieving all datasets from a history,
    # so fall back to the non-object interface.
    gi = get_galaxy_connection(history_id=history_id, obj=False)
    file_path_all = []
    datatypes_all = []

    if type(datasets_identifiers) is not list:
        datasets_identifiers = [datasets_identifiers]

    if identifier_type == "regex":
        datasets_identifiers = find_matching_history_ids(datasets_identifiers)
        identifier_type = "hid"

    for dataset_id in datasets_identifiers:
        file_path = '/import/%s' % dataset_id
        log.debug('Downloading gx=%s history=%s dataset=%s', gi, history_id,
                  dataset_id)
        # Cache the file requests: e.g. if someone does something silly like
        # calling get() for the same Galaxy file in a for-loop, we don't want
        # to re-download it every time and add that overhead.
        if not os.path.exists(file_path):
            hc = HistoryClient(gi)
            dc = DatasetClient(gi)
            history = hc.show_history(history_id, contents=True)
            datasets = {ds[identifier_type]: ds['id'] for ds in history}
            if retrieve_datatype:
                datatypes_all.append(
                    {ds[identifier_type]: ds['extension']
                     for ds in history})
            if identifier_type == 'hid':
                dataset_id = int(dataset_id)
            dc.download_dataset(datasets[dataset_id],
                                file_path=file_path,
                                use_default_filename=False)
        else:
            hc = HistoryClient(gi)
            dc = DatasetClient(gi)
            history = hc.show_history(history_id, contents=True)
            datatypes_all.append(
                {ds[identifier_type]: ds['extension']
                 for ds in history})
            log.debug('Cached, not re-downloading')

        file_path_all.append(file_path)

    ## Return the first path if only one item was given, otherwise all paths.
    ## This should not break backwards compatibility.
    if retrieve_datatype:
        if len(file_path_all) == 1:
            dataset_number = int(file_path_all[0].strip().split("/")[-1])
            return file_path_all, datatypes_all[0][dataset_number]
        else:
            datatype_multi = dict()
            for i in file_path_all:
                dataset_number = int(i.strip().split("/")[-1])
                datatype_multi[dataset_number] = datatypes_all[0][
                    dataset_number]
            return file_path_all, datatype_multi
    else:
        return file_path_all[0] if len(file_path_all) == 1 else file_path_all
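A hedged usage sketch for get() above; the hids, the regex, and the example extensions are placeholders:

path = get(3)                                           # single hid -> '/import/3'
paths = get([3, 5, 7])                                  # several hids -> ['/import/3', '/import/5', '/import/7']
paths, dtypes = get([3, 5], retrieve_datatype=True)     # dtypes maps hid -> Galaxy extension, e.g. {3: 'tabular', 5: 'txt'}
matches = get(['.*counts.*'], identifier_type='regex')  # regexes resolved to hids via find_matching_history_ids()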