Example #1
def upload_to_gdocs(fcdir, credentials_file=None, gdocs_folder=None):

    output_data = {
        'stdout': StringIO(),
        'stderr': StringIO(),
        'debug': StringIO()
    }

    if not os.path.exists(fcdir):
        LOG.error("The run folder, {} does not exist!".format(
            os.path.basename(fcdir)))
        return output_data

    credentials = google.get_credentials(credentials_file)
    if credentials is None:
        LOG.error("Could not parse the Google Docs credentials")
        return output_data

    metrics = collect_metrics(fcdir)
    samples = _format_samples(metrics)

    ssheet_name = _demultiplex_spreadsheet(metrics['RunInfo'].get(
        'Date', None))
    ssheet = SpreadSheet(credentials, ssheet_name)
    ssheet.move_to_folder(gdocs_folder)

    run_id = metrics['RunInfo']['Id'].split("_")
    wsheet_name = "_".join([run_id[0], run_id[-1]])

    # Write the metrics for the entire flowcell
    write_flowcell_metrics(samples, ssheet, wsheet_name)

    # Write project-centered metrics
    projects = list(set([sample.get('Project name', '')
                         for sample in samples]))
    for project in projects:
        if project in ['Undetermined_indices', '']:
            continue
        project_samples = [
            sample for sample in samples
            if sample.get('Project name', '') == project
        ]
        # Insert the run name as description
        for sample in project_samples:
            sample['Description'] = wsheet_name

        ssheet_name = "{}_sequencing_results".format(project)
        ssheet = SpreadSheet(credentials, ssheet_name)
        ssheet.move_to_folder(gdocs_folder)
        # Truncate the summary worksheet so that it won't show stale information if the upload fails
        write_flowcell_metrics([], ssheet, "Summary")
        write_flowcell_metrics(project_samples, ssheet, wsheet_name)

        # Create the summary over all worksheets in the project
        summary_samples = summarize_project(ssheet)
        write_flowcell_metrics(summary_samples, ssheet, "Summary")

    return output_data
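
# A minimal usage sketch for upload_to_gdocs above. The run folder path, the
# credentials file and the folder name are hypothetical placeholders; only the
# function signature comes from the example itself.
if __name__ == "__main__":
    out = upload_to_gdocs("/data/runs/140117_SN123_0456_AH0XYZADXX",
                          credentials_file="gdocs_credentials.json",
                          gdocs_folder="Demultiplexing results")
    print(out['stderr'].getvalue())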
Example #2
    def __init__(self, config):
        super(GDocsUpdater, self).__init__(config)

        # Connect to the Google Docs api
        gdconf = self.config.get("gdocs",{})
        creds = os.path.expanduser(gdconf.get("credentials_file",""))
        assert os.path.exists(creds), "Supplied GDocs credentials file does not exist"
        self.gdcon = SpreadSheet(get_credentials(creds))
        assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"
        doc = gdconf.get("qc_checklist",None)
        assert doc, "No QC checklist specified in configuration, please specify"
        ssheet = self.gdcon.get_spreadsheet(doc)
        assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(doc)
        self.gdcon.ssheet = ssheet

        # Get the Ongoing, Finished and Coming worksheets
        self.ongoing = self.gdcon.get_worksheet("Ongoing")
        self.coming = self.gdcon.get_worksheet("Coming")
        self.finished = self.gdcon.get_worksheet("Finished")
        assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(doc)

        # Get a connection to the StatusDB project database
        dbconf = self.config.get("statusdb",{})
        try:
            self.pcon = ProjectSummaryConnection(url=dbconf.get("url","localhost"),
                                                 username=dbconf.get("user","user"),
                                                 password=dbconf.get("password","pass"))
        except ConnectionError:
            self.pcon = None
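
# The constructor above documents its expected configuration only implicitly.
# A minimal config sketch, with keys taken from the config.get(...) calls in
# the code and placeholder values:
example_config = {
    "gdocs": {
        "credentials_file": "~/.gdocs_credentials",  # must exist on disk
        "qc_checklist": "QC checklist",              # spreadsheet title
    },
    "statusdb": {
        "url": "localhost",
        "user": "user",
        "password": "pass",
    },
}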
Example #3
def get_20158_info(credentials, project_name_swe):
    versions = {
        "01": ['Sample name Scilife', "Total reads per sample", "Sheet1",
               "Passed=P/ not passed=NP*"],
        "02": ["Sample name (SciLifeLab)", "Total number of reads (Millions)", "Sheet1",
               "Based on total number of reads after mapping and duplicate removal"],
        "03": ["Sample name (SciLifeLab)", "Total number of reads (Millions)", "Sheet1",
               "Based on total number of reads after mapping and duplicate removal "],
        "05": ["Sample name (from Project read counts)", "Total number", "Sheet1",
               "Based on total number of reads",
               "Based on total number of reads after mapping and duplicate removal"],
        "06": ["Sample name (from Project read counts)", "Total number", "Sheet1",
               "Based on total number of reads",
               "Based on total number of reads after mapping and duplicate removal"]}
    info = {}
    client = SpreadSheet(credentials)
    feed = client.get_spreadsheets_feed(project_name_swe + '_20158', False)
    if len(feed.entry) != 0:
        ssheet = feed.entry[0].title.text
        version = ssheet.split('_20158_')[1].split(' ')[0].split('_')[0]
        client = SpreadSheet(credentials, ssheet)
        content, ws_key, ss_key = get_google_document(ssheet, versions[version][2], client)
        dummy, P_NP_colindex = get_column(content, versions[version][3])
        dummy, No_reads_sequenced_colindex = get_column(content, versions[version][1])
        row_ind, scilife_names_colindex = get_column(content, versions[version][0])
        if version in ("05", "06"):
            # versions[version][4] holds the duplicate-removal column header
            dummy, P_NP_duprem_colindex = get_column(content, versions[version][4])
        else:
            P_NP_duprem_colindex = ''
        for j, row in enumerate(content):
            if j > row_ind:
                try:
                    sci_name = str(row[scilife_names_colindex]).strip()
                    stripped_name = strip_index(sci_name)
                    no_reads = str(row[No_reads_sequenced_colindex]).strip()
                    if (P_NP_duprem_colindex != '') and (str(row[P_NP_duprem_colindex]).strip() != ''):
                        status = str(row[P_NP_duprem_colindex]).strip()
                    else:
                        status = str(row[P_NP_colindex]).strip()
                    info[stripped_name] = [status, no_reads]
                except Exception:
                    # tolerate short or malformed rows
                    pass
    else:
        info = None
    return info
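
# A usage sketch for get_20158_info above. The credentials path and project
# name are hypothetical; get_credentials is the helper used by the other
# examples in this collection.
credentials = get_credentials("gdocs_credentials.json")
info = get_20158_info(credentials, "J.Doe_13_01")
if info is None:
    print("No 20158 spreadsheet found for the project")
else:
    for sample, (status, no_reads) in info.items():
        print("{}: {} ({} reads)".format(sample, status, no_reads))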
Example #4
def main(project_name, conf, cred):
    credentials = get_credentials(cred)
    client = SpreadSheet(credentials)
    config = cl.load_config(conf)
    couch = load_couch_server(conf)
    analysis_db = couch['analysis']
    #proj_db = couch['projects']
    BP_RNA = DB.BP_RNA(project_name)
    key = find_proj_from_view(analysis_db, project_name)
    BP_RNA.obj['_id'] = find_or_make_key(key)
    info = save_couchdb_obj(analysis_db, BP_RNA.obj)
    LOG.info('project %s %s : _id = %s' % (project_name, info, BP_RNA.obj['_id']))
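
# A possible command-line entry point for main() above; the option names are
# illustrative assumptions, not part of the original module.
if __name__ == "__main__":
    from optparse import OptionParser
    parser = OptionParser(usage="%prog [options] PROJECT_NAME")
    parser.add_option("-c", "--config", dest="conf", help="configuration file")
    parser.add_option("-r", "--credentials", dest="cred", help="GDocs credentials file")
    options, args = parser.parse_args()
    main(args[0], options.conf, options.cred)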
Example #5
def upload_to_gdocs(log, fcdir, credentials_file=None, gdocs_folder=None):

    output_data = {'stdout':StringIO(), 'stderr':StringIO(), 'debug':StringIO()}

    if not os.path.exists(fcdir):
        log.error("The run folder, {} does not exist!".format(os.path.basename(fcdir)))
        return output_data

    credentials = google.get_credentials(credentials_file)
    if credentials is None:
        log.error("Could not parse the Google Docs credentials")
        return output_data

    metrics = collect_metrics(fcdir, log)
    samples = _format_samples(metrics)

    ssheet_name = _demultiplex_spreadsheet(metrics['RunInfo'].get('Date',None))
    ssheet = SpreadSheet(credentials,ssheet_name)
    if ssheet.new_doc:
        ssheet.move_to_folder(gdocs_folder)

    run_id = metrics['RunInfo']['Id'].split("_")
    wsheet_name = "_".join([run_id[0],run_id[-1]])

    # Write the metrics for the entire flowcell
    write_flowcell_metrics(samples, ssheet, wsheet_name)

    # Write project-centered metrics
    projects = list(set([sample.get('Project name','') for sample in samples]))
    for project in projects:
        if project in ['Undetermined_indices','']:
            continue
        project_samples = [sample for sample in samples if sample.get('Project name','') == project]
        # Insert the run name as description
        for sample in project_samples:
            sample['Description'] = wsheet_name

        ssheet_name = "{}_sequencing_results".format(project)
        ssheet = SpreadSheet(credentials,ssheet_name)
        if ssheet.new_doc:
            ssheet.move_to_folder(gdocs_folder)
        # Truncate the summary worksheet so that it won't show stale information if the upload fails
        write_flowcell_metrics([], ssheet, "Summary")
        project_samples = summarize_project(log, ssheet,{wsheet_name: project_samples})
        write_flowcell_metrics(project_samples, ssheet, wsheet_name)

        # Create the summary over all worksheets in the project
        summary_samples = summarize_project(log, ssheet)
        write_flowcell_metrics(summary_samples, ssheet, "Summary")

    return output_data
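
# Examples #1 and #5 differ mainly in the new_doc guard: only a freshly created
# spreadsheet is moved into the target folder. An illustrative stub of the
# SpreadSheet interface the two examples assume; the real class is not shown in
# this collection, so this is a reading aid rather than the actual API.
class SpreadSheetStub(object):
    def __init__(self, credentials, name=None):
        self.credentials = credentials
        self.name = name
        self.new_doc = True  # would be True only when the document was created

    def move_to_folder(self, folder):
        pass  # would move the document into the given Google Docs folder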
Example #6
    def __init__(self, project_name, config):
        """Initialize the object"""

        # Map internal attribute names to the GPL column headers
        col_mapping = self.column_mapping()
        for attr in col_mapping.keys():
            setattr(self, attr, None)

        # Get the name of the spreadsheet where uppnex ids can be found
        gdocs_config = config.get("gdocs", config.get("gdocs_upload",{}))
        cred_file = gdocs_config.get("credentials_file",gdocs_config.get("gdocs_credentials"))
        ssheet_title = gdocs_config.get("projects_spreadsheet")
        wsheet_title = gdocs_config.get("projects_worksheet")

        # Get the credentials
        credentials = get_credentials(cred_file)
        assert credentials is not None, \
            "The Google Docs credentials could not be found."
        assert ssheet_title is not None and wsheet_title is not None, \
            "The names of the projects spreadsheet and worksheet on " \
            "Google Docs could not be found."

        # Connect to the spread- and worksheet
        ssheet = SpreadSheet(credentials, ssheet_title)
        assert ssheet is not None, \
            "Could not fetch '{}' from Google Docs.".format(ssheet_title)

        # We allow multiple, comma-separated worksheets to be searched
        for wtitle in wsheet_title.split(','):
            wsheet = ssheet.get_worksheet(wtitle.strip())
            if not wsheet:
                print("WARNING: Could not locate {} in {}".format(wsheet_title, ssheet_title))
                continue

            # Get the rows for the project
            rows = ssheet.get_cell_content(wsheet)
            header = ssheet.get_header(wsheet)
            column_indexes = {attr: ssheet.get_column_index(wsheet, col) - 1
                              for attr, col in col_mapping.items()}
            for row in rows:
                # skip if this is not the project we're interested in
                if row[column_indexes["project_name"]] != project_name:
                    continue
                
                # Will only use the first result found to set each attribute
                for attr, index in column_indexes.items():
                    setattr(self, attr, row[index])

                # We have found the project data so stop iterating
                return
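
# The constructor above relies on a column_mapping() method that is not shown.
# A hypothetical sketch of what it might return: a mapping from attribute
# names to GPL column headers. Only "project_name" is actually required by the
# lookup loop above; the other entry is invented for illustration.
def column_mapping(self):
    return {
        "project_name": "Project name",  # required by the row-matching loop
        "uppnex_id": "Uppnex ID",        # hypothetical extra column
    }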
Example #7
def get_20158_info(credentials, project_name_swe):
    versions = {
        "01": [
            'Sample name Scilife', "Total reads per sample", "Sheet1",
            "Passed=P/ not passed=NP*"
        ],
        "02": [
            "Sample name (SciLifeLab)", "Total number of reads (Millions)",
            "Sheet1",
            "Based on total number of reads after mapping and duplicate removal"
        ],
        "03": [
            "Sample name (SciLifeLab)", "Total number of reads (Millions)",
            "Sheet1",
            "Based on total number of reads after mapping and duplicate removal "
        ],
        "05": [
            "Sample name (from Project read counts)", "Total number", "Sheet1",
            "Based on total number of reads",
            "Based on total number of reads after mapping and duplicate removal"
        ],
        "06": [
            "Sample name (from Project read counts)", "Total number", "Sheet1",
            "Based on total number of reads",
            "Based on total number of reads after mapping and duplicate removal"
        ]
    }
    info = {}
    client = SpreadSheet(credentials)
    feed = client.get_spreadsheets_feed(project_name_swe + '_20158', False)
    if len(feed.entry) != 0:
        ssheet = feed.entry[0].title.text
        version = ssheet.split('_20158_')[1].split(' ')[0].split('_')[0]
        client = SpreadSheet(credentials, ssheet)
        content, ws_key, ss_key = get_google_document(ssheet,
                                                      versions[version][2],
                                                      client)
        dummy, P_NP_colindex = get_column(content, versions[version][3])
        dummy, No_reads_sequenced_colindex = get_column(
            content, versions[version][1])
        row_ind, scilife_names_colindex = get_column(content,
                                                     versions[version][0])
        if (version == "05") | (version == "06"):
            dummy, P_NP_duprem_colindex = get_column(
                content, versions[version][4])  ## [version][4] for dup rem
        else:
            P_NP_duprem_colindex = ''
        for j, row in enumerate(content):
            if j > row_ind:
                try:
                    sci_name = str(row[scilife_names_colindex]).strip()
                    stripped_name = strip_index(sci_name)
                    no_reads = str(row[No_reads_sequenced_colindex]).strip()
                    if (P_NP_duprem_colindex != '') and (str(
                            row[P_NP_duprem_colindex]).strip() != ''):
                        status = str(row[P_NP_duprem_colindex]).strip()
                    else:
                        status = str(row[P_NP_colindex]).strip()
                    info[stripped_name] = [status, no_reads]
                except Exception:
                    # tolerate short or malformed rows
                    pass
    else:
        info = None
    return info
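
# get_20158_info depends on a strip_index() helper that is not part of this
# collection. A hypothetical sketch, assuming the helper removes a trailing
# index suffix such as "_index5" from a sample name.
import re

def strip_index_sketch(sample_name):
    return re.sub(r'_index\d+$', '', sample_name)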
Example #8
    for sample in sample_dir:
        flag = os.path.join(p_path, sample, 'FINISHED_AND_DELIVERED')
        if not os.path.exists(flag):
            sam_cnt += 1
            sam_nm.append(sample)
    if sam_cnt == 0:
        status = 'full_marked'
    elif sam_cnt == len(sample_dir):
        status = 'none_marked'
    else:
        status = 'some_marked'
    return status, sam_nm

## get the spreadsheet by the given name and the appropriate worksheet
credentials = get_credentials(config['credentials_file'])
ssheet = SpreadSheet(credentials, config['gpl_spreadsheet'])
wksheet = ssheet.get_worksheet(config['gpl_worksheet'])
cell_feed = ssheet.get_cell_feed(wksheet)

print "**** DONE ****\n" ##log
print "Parsing worksheet and obtaining information..." ##log

## iterate through the cells' content and collect the list of projects signed off as delivered
col_tot = int(cell_feed.col_count.text)
projects_done, cell_content, row, ck, rNum = ([], [], [], 0, 1)
for cell in cell_feed.entry:
    row.append(cell.content.text or "")
    ck += 1
    if ck == col_tot:
        if rNum == 1:
            head1 = row
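
# The loop at the top of Example #8 is cut off mid-function. A self-contained
# sketch of the check it appears to perform; the function name and signature
# are assumptions, while the flag-file logic mirrors the fragment.
import os

def delivery_status_sketch(p_path, sample_dirs):
    unmarked = [s for s in sample_dirs
                if not os.path.exists(os.path.join(p_path, s, 'FINISHED_AND_DELIVERED'))]
    if not unmarked:
        return 'full_marked', unmarked
    if len(unmarked) == len(sample_dirs):
        return 'none_marked', unmarked
    return 'some_marked', unmarked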
Example #9
class GDocsUpdater(rm.RunMonitor):
    
    def __init__(self, config):
        super(GDocsUpdater, self).__init__(config)
            
        # Connect to the Google Docs api
        gdconf = self.config.get("gdocs",{})
        creds = os.path.expanduser(gdconf.get("credentials_file",""))
        assert os.path.exists(creds), "Supplied GDocs credentials file does not exist"
        self.gdcon = SpreadSheet(get_credentials(creds))
        assert self.gdcon, "Could not get a SpreadSheet object, please verify gdocs credentials"
        doc = gdconf.get("qc_checklist",None)
        assert doc, "No QC checklist specified in configuration, please specify"
        ssheet = self.gdcon.get_spreadsheet(doc)
        assert ssheet, "Could not locate QC checklist '{}' on Google Docs. Please make sure it exists".format(doc)
        self.gdcon.ssheet = ssheet
        
        # Get the Ongoing, Finished and Coming worksheets
        self.ongoing = self.gdcon.get_worksheet("Ongoing")
        self.coming = self.gdcon.get_worksheet("Coming")
        self.finished = self.gdcon.get_worksheet("Finished")
        assert self.ongoing and self.coming and self.finished, "Could not get 'Ongoing', 'Finished' and 'Coming' worksheets from '{}'. Please make sure that they exist".format(doc)
        
        # Get a connection to the StatusDB project database
        dbconf = self.config.get("statusdb",{})
        try:
            self.pcon = ProjectSummaryConnection(url=dbconf.get("url","localhost"), 
                                                 username=dbconf.get("user","user"), 
                                                 password=dbconf.get("password","pass"))
        except ConnectionError:
            self.pcon = None
    def _list_runs(self, lists):
        # Loop over the lists and fetch the cards
        runs = {}
        for tlist in lists:
            list_obj = self.trello.get_list(self.trello_board,tlist,True)
            if not list_obj:
                continue
            
            # Loop over the cards in the list
            for card in list_obj.list_cards():
                # Get the description and convert it to a dictionary
                runs[card.name] = self.description_to_dict(card.description)
                
        return runs
    
    def coming_runs(self):
        """Return a dictionary with runs that are currently in process, i.e. not handed over to 
        the processing pipeline on Uppmax. The key in the dictionary is the run id and the values
        is a metadata dictionary
        """
        
        # Runs in these lists are to be considered "coming"
        lists = [rm.FIRSTREAD,
                 rm.INDEXREAD,
                 rm.SECONDREAD,
                 rm.PROCESSING,
                 rm.UPPMAX,
                 rm.STALLED]
        return self._list_runs(lists)
        
        
    def ongoing_runs(self):
        """Return a dictionary with runs that have finished and have been handed over to
        the processing pipeline on Uppmax. The key in the dictionary is the run id and the
        value is a metadata dictionary.
        """

        # Runs in these lists are to be considered "ongoing"
        lists = [rm.COMPLETED]
        return self._list_runs(lists)
        
    def reshape_run_info(self, runs, skiplist=[]):
        """Take the dictionary of runs and convert it to a list of lists with elements
        corresponding to the columns in the checklist"""
        
        run_projects = []
        for run_id, data in runs.items():
            p = data.get('Projects', [''])
            if type(p) is not list:
                p = [p]
            for project in p:
                if len(project) == 0:
                    project = 'Unknown, please check!'
                if "{}_{}".format(run_id, project) not in skiplist:
                    application, tp = '', ''  # self.lookup_project(project)
                    run_projects.append([run_id, project, application, tp, '',
                                         data.get('Run mode', [''])[0]])

        return run_projects
        
    def lookup_project(self, project):
        """Lookup project application and type in StatusDB"""
        
        application = ""
        type = ""
        if self.pcon:
            pdoc = self.pcon.get_entry(project)
            if pdoc:
                application = str(pdoc.get("application",""))
                type = str(pdoc.get("type",pdoc.get("details",{}).get("type","")))
                
        return application, type
    
    def get_skiplist(self):
        """Get the runs and projects already listed in the GDocs spreadsheet
        """
        
        skiplist = []
        # Get the contents from the finished worksheet
        for run_project in self.gdocs_finished_runs():
            skiplist.append("{}_{}".format(run_project[0],run_project[1]))
    
        return skiplist
    
    def gdocs_coming_runs(self):
        return self._get_gdocs_run_projects(self.coming,COMING_HEADER_OFFSET)
    def gdocs_ongoing_runs(self):
        return self._get_gdocs_run_projects(self.ongoing,ONGOING_HEADER_OFFSET)
    def gdocs_finished_runs(self):
        return self._get_gdocs_run_projects(self.finished,FINISHED_HEADER_OFFSET)
    
    def _get_gdocs_run_projects(self, wsheet, header_offset):
        
        # Get the cell data
        run_projects = {}
        rows = self.gdcon.get_cell_content(wsheet,header_offset,1,0,6)
        for row in rows:
            if len(str(row[0])) == 0:
                continue
            data = [str(r) for r in row]
            key = "{}{}".format(data[0],data[1])
            if key in run_projects:
                continue
            run_projects[key] = data
        
        # Only return unique rows
        return run_projects.values()
        
    def update_gdocs(self):
        
        # Get the coming runs from Trello but exclude runs that are already in gdocs
        gdocs_finished = self.gdocs_finished_runs()
        gdocs_ongoing = self.gdocs_ongoing_runs()
        gdocs_coming = self.gdocs_coming_runs()
        trello_coming = self.reshape_run_info(
            self.coming_runs(),
            ["{}_{}".format(r[0], r[1]) for r in gdocs_finished + gdocs_ongoing + gdocs_coming])
        # Get the ongoing runs from Trello but exclude runs that are already in the finished or ongoing tab
        trello_ongoing = self.reshape_run_info(
            self.ongoing_runs(),
            ["{}_{}".format(r[0], r[1]) for r in gdocs_finished + gdocs_ongoing])
        
        # Add each coming run to the next empty row
        for run in trello_coming:
            self.update_empty_row(self.coming,run,COMING_HEADER_OFFSET)
        
        # Move each run from the coming tab to the ongoing tab if it exists there, otherwise just add it
        for run in trello_ongoing:
            status = self.run_project_match(run,gdocs_coming)
            if status == 0:
                self.update_empty_row(self.ongoing,run,ONGOING_HEADER_OFFSET)
                continue
            # Find the row index of the run in the coming tab
            row_index = self.gdcon.get_row_index(self.coming,run[0:2],COMING_HEADER_OFFSET)
            # Get the data from the coming tab, add it to an empty row in the ongoing tab
            # and replace it with empty values
            row_data = self.gdcon.get_cell_content(self.coming,row_index,0,row_index,0)
            self.update_empty_row(self.ongoing,row_data[0],ONGOING_HEADER_OFFSET)
            self.gdcon.update_row(self.coming,row_index,["" for i in xrange(len(row_data[0]))])
    
        def last_name(data):
            # Sort key: the part of the project name after the first '.',
            # falling back to the whole name if there is no dot
            pcs = data[1].split('.')
            if len(pcs) == 1:
                return pcs[0]
            return "".join(pcs[1:])
    
        # Lastly, update the application and type fields in gdocs if they are empty
        for wsheet, offset in [(self.coming, COMING_HEADER_OFFSET), (self.ongoing, ONGOING_HEADER_OFFSET)]:
            # Print a reader-friendly text to stdout
            print("{}\n{}\n".format(wsheet.title.text,"".join(['-' for i in xrange(len(wsheet.title.text))])))
            for run in sorted(self._get_gdocs_run_projects(wsheet,offset), key=last_name):
                if len(run) < 4:
                    continue
                if run[2] == "" or run[3] == "":
                    app, tp = self.lookup_project(run[1])
                    if run[2] == "":
                        run[2] = app
                    if run[3] == "":
                        run[3] = tp
                row_index = self.gdcon.get_row_index(wsheet,run[0:2],offset)
                self.gdcon.update_row(wsheet,row_index,run[0:4])
                
                print("{} - {}{}".format(run[1],"{} - ".format(run[3]) if len(run[3]) > 0 else "",run[4]))
                print("{}{}\n".format("{}\n".format(run[2]) if len(run[2]) > 0 else "",run[0]))
            
    def update_empty_row(self, wsheet, data, offset, merged=False):
        """Update the next empty row after the specified offset with the supplied data
        """
        updated = False
        # Find the next empty row (when writing a merged row, require two empty rows in succession)
        row_index = offset
        r2 = row_index
        while r2-row_index != 1:
            row_index = self.gdcon.get_row_index(wsheet,["" for i in xrange(len(data))],r2)
            # If we're writing a merged row, we need two consecutive empty rows
            if merged:
                r2 = self.gdcon.get_row_index(wsheet,["" for i in xrange(len(data))],row_index+1)
            else:
                r2 = row_index+1
            
        assert row_index > 0, "***ERROR*** No more rows left in spreadsheet"
        updated = self.gdcon.update_row(wsheet,row_index,data)
        # FIXME: do this better.. if the row is merged, write the same data to the second "hidden" row
        if merged:
            self.gdcon.update_row(wsheet,row_index+1,data)
            
        return updated
        
    def run_project_match(self, needle, haystack):
        """Checks if a run and project exist in a list of lists. Determines identity by the two first 
        columns in each list, the third and fourth are checked to determine if they need updating.
        Return 0 for no match, 1 for match that needs updating and 2 for a match that does not need updating
        """
        if len(needle) < 4:
            return 0
        
        for straw in haystack:
            if len(straw) < 4:
                continue
            if needle[0] != straw[0] or needle[1] != straw[1]:
                continue
            if needle[2] != straw[2] or needle[3] != straw[3]:
                return 1
            return 2
        
        return 0
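
# A minimal driver sketch for GDocsUpdater above; the config shape matches the
# keys read in __init__ and all values are placeholders.
config = {
    "gdocs": {"credentials_file": "~/.gdocs_credentials",
              "qc_checklist": "QC checklist"},
    "statusdb": {"url": "localhost", "user": "user", "password": "pass"},
}
updater = GDocsUpdater(config)
updater.update_gdocs()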