Ejemplo n.º 1
0
    def getClient(self):
        """Returns a spreadsheet client if there's an Internet connection.

        Returns None if no connection.
        """
        client = SpreadsheetsService()

        try:
            client.GetWorksheetsFeed(self.spreadsheet_key,
                                     visibility='public',
                                     projection='basic')
        except gaierror:
            client = None

        return client
Ejemplo n.º 2
0
def get_cos_contributors():
    ''' returns a list of contribtors from a google spreadsheet
    specified by spreadsheet ID '''

    client = SpreadsheetsService()

    feed = client.GetWorksheetsFeed(DOC_KEY, visibility='public', wksht_id='1')
    sheet_key = feed.id.text.rsplit('/', 1)[1]

    list_of_rows = client.GetListFeed(DOC_KEY, sheet_key,
                                      visibility='public').entry
    print('There are {} rows on the COS spreadsheet.'.format(
        len(list_of_rows)))

    cos_users = []
    for row in list_of_rows:
        row_dict = row.custom
        cos_users.append(row_dict[COL_ROW_NAME].text)

    return cos_users
Ejemplo n.º 3
0
class GoogleSpreadsheetReader(object):
    """
    Directs Google Spreadsheets service client to obtain spreadsheet cells.
    """
    def __init__(self, options):
        """Init the Google Spreadsheets service client."""
        self.options = options
        self.service = GoogleSpreadsheetsService()
        if not self.options.google_email:
            print "Warning: Google account email not provided."
        if not self.options.google_password:
            print "Warning: Google account password not provided."
        self.service.email = self.options.google_email
        self.service.password = self.options.google_password
        self.service.ProgrammaticLogin()
        if not self.options.google_spreadsheet_key:
            print "Warning: Google spreadsheet key not provided."

    def get_cells(self, sheet_index=0):
        """Returns a dict of cell data keyed by cell coordinate (row, col)."""
        cells = {}
        spreadsheet_key = self.options.google_spreadsheet_key
        sheets_feed = self.service.GetWorksheetsFeed(spreadsheet_key)
        sheet_id = sheets_feed.entry[sheet_index].id.text.split('/')[-1]
        cells_feed = self.service.GetCellsFeed(spreadsheet_key, sheet_id)
        for entry in cells_feed.entry:
            try:
                row_id = entry.cell.row
                col_id = entry.cell.col
                data = entry.content.text
            except Exception, inst:
                msg = "Couldn't read cell feed entry: %s" % inst
                msg += "\n%s" % entry
                raise Exception, msg
            try:
                row_id = int(row_id)
                col_id = int(col_id)
            except:
                continue
            cells[(row_id, col_id)] = data
        return cells
Ejemplo n.º 4
0
def load_google_spreadsheet(spreadsheet_key, worksheet_name=None):
    """Downloads and exports a Google Spreadsheet in TSV format.

    Returns a string containing the spreadsheet contents in TSV format (e.g.
    for writing out to a file or parsing).

    The first line is assumed to be the spreadsheet header (i.e. containing
    column names), which can optionally be followed by one or more comment
    lines (starting with '#'). Only the first cell of a comment line will be
    parsed (to keep exported spreadsheets consistent with QIIME mapping files'
    comments). The (optional) comments section is then followed by the
    spreadsheet data.

    Some of this code is based on the following websites, as well as the
    gdata.spreadsheet.text_db module:
        http://www.payne.org/index.php/Reading_Google_Spreadsheets_in_Python
        http://stackoverflow.com/a/12031835

    Arguments:
        spreadsheet_key - the key used to identify the spreadsheet (a string).
            Can either be a key or a URL containing the key
        worksheet_name - the name of the worksheet to load data from (a
            string). If not supplied, will use first worksheet in the
            spreadsheet
    """
    spreadsheet_key = _extract_spreadsheet_key_from_url(spreadsheet_key)
    gd_client = SpreadsheetsService()

    try:
        worksheets_feed = gd_client.GetWorksheetsFeed(spreadsheet_key,
                                                      visibility='public',
                                                      projection='basic')
    except gaierror:
        raise GoogleSpreadsheetConnectionError("Could not establish "
                                               "connection with server. Do "
                                               "you have an active Internet "
                                               "connection?")

    if len(worksheets_feed.entry) < 1:
        raise GoogleSpreadsheetError("The Google Spreadsheet with key '%s' "
                                     "does not have any worksheets associated "
                                     "with it." % spreadsheet_key)

    # Find worksheet that will be exported. If a name has not been provided,
    # use the first worksheet.
    worksheet = None
    if worksheet_name is not None:
        for sheet in worksheets_feed.entry:
            if sheet.title.text == worksheet_name:
                worksheet = sheet

        if worksheet is None:
            raise GoogleSpreadsheetError("The worksheet name '%s' could not "
                                         "be found in the Google Spreadsheet "
                                         "with key '%s'." %
                                         (worksheet_name, spreadsheet_key))
    else:
        # Choose the first one.
        worksheet = worksheets_feed.entry[0]

    # Extract the ID of the worksheet.
    worksheet_id = worksheet.id.text.split('/')[-1]

    # Now that we have a spreadsheet key and worksheet ID, we can read the
    # data. First get the headers (first row). We need this in order to grab
    # the rest of the actual data in the correct order (it is returned
    # unordered).
    headers = _get_spreadsheet_headers(gd_client, spreadsheet_key,
                                       worksheet_id)
    if len(headers) < 1:
        raise GoogleSpreadsheetError("Could not load spreadsheet header (it "
                                     "appears to be empty). Is your Google "
                                     "Spreadsheet with key '%s' empty?" %
                                     spreadsheet_key)

    # Loop through the rest of the rows and build up a list of data (in the
    # same row/col order found in the spreadsheet).
    spreadsheet_lines = _export_spreadsheet(gd_client, spreadsheet_key,
                                            worksheet_id, headers)

    out_lines = StringIO()
    tsv_writer = writer(out_lines, delimiter='\t', lineterminator='\n')
    tsv_writer.writerows(spreadsheet_lines)
    return out_lines.getvalue()
class SheetQuerier:
    key = "0AkGO8UqErL6idDhYYjg1ZXlORnRaM3ZhTks4Z3FrYlE"
    worksheets = {"DVD": "movietitle", "Blu-ray": "movietitle"}
    cache = "spreadsheet_cache.p"
    cache_timeout = 60 * 60 * 24 * 7
    prefixes = ["The", "A"]
    whitespace_pattern = re.compile('[^a-zA-Z0-9 ]+', re.UNICODE)

    def __init__(self, force_reload=False):
        try:
            pickle_date, self.data = pickle.load(open(self.cache, "rb"))
            if DEBUG:
                print "Loaded cache successfully"
            if force_reload or time.time() - pickle_date > self.cache_timeout:
                if DEBUG:
                    if force_reload:
                        print "Cache update forced."
                    else:
                        print "Cache too old, reloading."
                self.reload_data()
                pickle.dump((time.time(), self.data), open(self.cache, "wb"))
        except:
            if DEBUG:
                print "Problem loading cache, reloading."
            self.reload_data()
            pickle.dump((time.time(), self.data), open(self.cache, "wb"))

    def reload_data(self):
        self.client = SpreadsheetsService()
        feed = self.client.GetWorksheetsFeed(self.key,
                                             visibility='public',
                                             projection='basic')
        self.data = {}
        for entry in feed.entry:
            if entry.title.text in self.worksheets.keys():
                bad_rows, total_rows = self.process_sheet(
                    entry.id.text.split("/")[-1],
                    self.worksheets[entry.title.text])
                print "Skipped %d / %d rows in sheet \"%s\"" % (
                    bad_rows, total_rows, entry.title.text)
            elif DEBUG:
                print "Skipped sheet \"%s\"" % entry.title.text

    def process_sheet(self,
                      sheet_key,
                      movie_row_key,
                      type_row_key="forcedsubtitletype"):
        if DEBUG:
            print "Document: %s" % self.key
            print "Sheet: %s" % sheet_key

        rows = self.client.GetListFeed(self.key,
                                       sheet_key,
                                       visibility='public',
                                       projection='values').entry
        bad_rows = 0
        for row in rows:
            try:
                self.data[SheetQuerier.clean_title(
                    row.custom[movie_row_key].text.strip(
                    ))] = row.custom[type_row_key].text.strip()
            except:
                bad_rows += 1
        return bad_rows, len(rows)

    def query_exact(self, title):
        query = SheetQuerier.clean_title(title)
        if query in self.data:
            return self.data[query]
        else:
            return False

    @staticmethod
    def clean_title(title):
        # Move prefixes
        for prefix in SheetQuerier.prefixes:
            if title.endswith(prefix):
                title = "%s %s" % (prefix, title[:-1 * len(prefix) - 2])
                break
        # Strip all non alpha-numeric characters
        title = SheetQuerier.whitespace_pattern.sub('', title)

        # Return the lowercase version
        return title.lower()
Ejemplo n.º 6
0
class GDataClient(object):
    gd_client = None
    gd_cur_ss_key = None
    gd_cur_ws_id = None
    
    def __init__(self,uname,pswd,document_name=None,worksheet_name=None):
        # Connect to Google
        self.gd_client = SpreadsheetsService(email=uname,password=pswd,source=GDATA_SOURCE)
        self.gd_client.ProgrammaticLogin()
        if document_name is not None:
            self.set_document(document_name)
        if worksheet_name is not None:
            self.set_worksheet(worksheet_name)
    
    def __ss_check(self):
        ''' Make sure spreadsheet has been set before we try to do anything with worksheets.
        '''
        if self.gd_cur_ss_key is None:
            raise GDError('Must set spreadsheet before accessing worksheets!')

    def __header_to_key(self,hdr_string):
        ''' Google sheets column headers are used as keys to row info dictionaries, but first most non alphanumeric 
        characters are removed and the letters are lower-cased.
            
        '''
        return ''.join(re.findall('[a-z\-0-9\.]+',hdr_string.lower()))

    def set_document(self, docname):
        ''' Set current spreadsheet document given a title.
        '''
        q = DocumentQuery(params={'title':docname,'title-exact':'true'})
        ss_feed = self.gd_client.GetSpreadsheetsFeed(query=q)
        if len(ss_feed.entry) != 1:
            raise GDError('{} spreadsheets match the given name "{}" (expected exactly one)!'.format(len(ss_feed.entry),docname))
        self.gd_cur_ss_key = ss_feed.entry[0].id.text.rsplit('/',1)[1]

    def list_documents(self):
        ''' List all spreadsheet documents available.
        '''
        feed = self.gd_client.GetSpreadsheetsFeed(DocumentQuery())
        return [en.title.text for en in feed.entry]

    def list_worksheets(self):
        ''' List all worksheets in the current spreadsheet document.
        '''
        self.__ss_check()
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key)
        return [en.title.text for en in ws_feed.entry]

    def set_worksheet(self, sheetname):
        ''' Set current worksheet within the current spreadsheet document.
        '''
        self.__ss_check()
        q = DocumentQuery(params={'title':sheetname,'title-exact':'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,query=q)
        if len(ws_feed.entry) != 1:
            raise GDError('{} worksheets match the given name "{}" (expected exactly one)!'.format(len(ws_feed.entry),sheetname))
        self.gd_cur_ws_id = ws_feed.entry[0].id.text.rsplit('/',1)[1]

    def add_worksheet(self, title, rows, cols, overwrite=False):
        ''' Add a worksheet to current spreadsheet document (if it does not exist).  Switch current sheet to the (new) guy.
        '''
        self.__ss_check()
        
        # First, check to see if the worksheet already exists
        q = DocumentQuery(params={'title':title,'title-exact':'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,query=q)
        ws_found = None
        if len(ws_feed.entry) > 0:
            if overwrite:
                if not self.gd_client.DeleteWorksheet(ws_feed.entry[0]):
                    raise GDError('Failed to delete existing worksheet named {} to overwrite!'.format(title))
            else:
                ws_found = ws_feed.entry[0]

        is_new = ws_found is None
        if is_new:
            ws_found = self.gd_client.AddWorksheet(title,rows,cols,self.gd_cur_ss_key)

        self.gd_cur_ws_id = ws_found.id.text.rsplit('/',1)[1]

        return is_new

    def set_headers(self,header_names):
        ''' Set the current worksheet headers given a list.
        NOTE: header list length must match columns in sheet!
        '''
        self.__ss_check()
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.col_count.text) != len(header_names):
            raise GDError('Number of headers ({}) does not match columns in spreadsheet ({})!'
                          .format(len(header_names),int(ws.col_count.text)))

        query = CellQuery(params={'min-row':'1','max-row':'1','min-col':'1','max-col':str(len(header_names))})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=query)
        if len(cells.entry) == 0:
            query = CellQuery(params={'min-row':'1','max-row':'1','min-col':'1','max-col':str(len(header_names)),'return-empty':'true'})
            cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=query)

        batchRequest = SpreadsheetsCellsFeed()

        for idx,val in enumerate(header_names):
            cells.entry[idx].cell.inputValue = val
            batchRequest.AddUpdate(cells.entry[idx])

        updated = self.gd_client.ExecuteBatch(batchRequest,cells.GetBatchLink().href)


    def delete_rows(self,row_list):
        ''' Given a list of row numbers, delete the corresponding rows from the current sheet.
        @param row_list List of integers in 0:num_rows
        '''
        if len(row_list) == 0:
            return

        self.__ss_check()
        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)
        ordered_list = sorted(row_list,reverse=True)
        if ordered_list[0] > int(list_feed.total_results.text):
            raise GDError('Tried to delete row {} but highest row number in sheet is {}!'.format(ordered_list[0],list_feed.total_results.text))

        for row in ordered_list:
            if not self.gd_client.DeleteRow(list_feed.entry[row]):
                raise GDError('Failed to delete row {} (partway through list: {})'.format(row,ordered_list))        
        
    def insert_rows(self,info_dict_list):
        ''' Add a group of rows based on list of dictionaries.

        @param info_dict_list Dictionary with keys for any headers with non-blank info in the new row 
        (all missing keys will be have blank data).
        '''
        self.__ss_check()
        # Check and make sure that none of the input dictionaries contains keys not in the first row of our list.
        hdr_list = [self.__header_to_key(hdr) for hdr in self.row_as_list(1)]
        bad_row = next((dd for dd in info_dict_list if len(set(dd.keys()) - set(hdr_list)) > 0), None)
        if bad_row is not None:
            raise GDError('Failed to insert row {} because it contains keys not in spreadsheet headers ({})!'
                          .format(bad_row,hdr_list))

        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)
        for inf in info_dict_list:
            self.gd_client.InsertRow(inf,self.gd_cur_ss_key, self.gd_cur_ws_id)

    def column_as_list(self,column,with_header=False):
        ''' Read just a single column into a list of strings.

        Ignore the first row by default because it's the column header.
        '''
        self.__ss_check()
        if with_header:
            minrow = 1
        else:
            minrow = 2
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.row_count.text) < minrow:
            return []

        q = CellQuery(params={'min-row':str(minrow),'min-col':str(column),'max-col':str(column)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=q)

        return [cellentry.cell.text for cellentry in cells.entry]

    def row_as_list(self,row):
        ''' Read just a single row into a list of strings.

        NOTE: this is indexed by cell, so row 1 is the header row!
        '''
        self.__ss_check()
        mincol = 1
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.col_count.text) < mincol:
            return []

        q = CellQuery(params={'min-row':str(row),'min-col':str(mincol),'max-row':str(row)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=q)

        return [cellentry.cell.text for cellentry in cells.entry]
        

    def read_to_list(self,num_lines=None):
        """ Read the sheet into a list of lists of strings """
        self.__ss_check()
        if num_lines is not None:
            q = DocumentQuery(params={'max-results':'%d'%num_lines})
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id,query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)

        string_list = []
        for i, entry in enumerate(list_feed.entry):
            row = {key:entry.custom[key].text for key in entry.custom}
            row['rowname'] = entry.title.text
            string_list.append(row)
        
        return string_list

    def read_to_dict(self,key_column_name,row_start=None,row_num=None):
        """ Read the sheet into a dictionary with keys given by the named column.
        
        NOTE: Raises an error on any redundant rows.
        """

        self.__ss_check()

        # Use this for testing to limit number of results handled:
        params = {}
        if row_start is not None:
            params['start-index'] = '%d'%row_start
        if row_num is not None:
            params['max-results'] = '%d'%row_num
        if len(params) != 0:
            q = ListQuery(params=params)
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id,query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)

        multi_rows = []
        string_dict = {}
        for i, entry in enumerate(list_feed.entry):
            # If we get the title column, ignore it.
            if entry.custom[key_column_name].text.replace(' ','').lower() == key_column_name:
                continue
            row = {key:entry.custom[key].text for key in entry.custom if key is not key_column_name}
            row['rowname'] = entry.title.text
            key_name = entry.custom[key_column_name].text
            if key_name in string_dict:
                multi_rows.append(key_name)
            string_dict[key_name] = row

        if len(multi_rows) > 0:
            errors = 'read_to_dict -- Column {} contains multiple rows with each of the following values: {}'.format(key_column_name,multi_rows)
            # raise GDError('read_to_dict -- Column {} contains multiple rows with each of the following values: {}'.format(key_column_name,multi_rows))
        else:
            errors = None

        return string_dict,errors