Exemple #1
0
def getSheetData(spreadsheet=SPREADSHEET, worksheet=WORKSHEET, headers=HEADERS):
    """
    Get the data of the worksheet as a list of rows, each row a dictionary of
    the headers
    """
    client = SpreadsheetsService()

    cols = dict(zip(string.uppercase, headers))
    rLabel = 0
    niceRows = []

    feedArgs = dict(key=spreadsheet, wksht_id=worksheet, visibility='public',
            projection='basic')

    cells = client.GetCellsFeed(**feedArgs)
    for cell in cells.entry:
        label = re.match('^([A-Z]+)([0-9]+)$', cell.title.text)
        cLabel = label.group(1)
        _rLabel = int(label.group(2))
        if _rLabel == 1:
            continue # header row
        if _rLabel > rLabel:
            niceRow = {}
            niceRows.append(niceRow)
            rLabel = _rLabel
        niceRow[cols[cLabel]] = cell.content.text

    return niceRows
class GoogleConn:
    def __init__(self, usr, pwd, form):
        global FORM_COLS
        self.gc = SpreadsheetsService()
        self.gc.ClientLogin(username=usr, password=pwd, source=APP_NAME)
        query = DocumentQuery()
        query.title = form
        feed = self.gc.GetSpreadsheetsFeed(query=query)
        if not feed.entry: raise Exception, "empty spreadsheet list"
        self.formid = feed.entry[0].id.text.rsplit('/', 1)[1]
        #        self.update = feed.entry[0].updated.text
        query = CellQuery()
        query.range = '%c1:%c1' % tuple(i + ord('A')
                                        for i in (0, len(FORM_KEYS) - 1))
        feed = self.gc.GetCellsFeed(self.formid, query=query)
        FORM_COLS = tuple(''.join(c for c in cell.content.text.lower()
                                  if c.isalnum() or c in '-_')
                          for cell in feed.entry)
        self.statcol = FORM_COLS[FORM_KEYS.index('status')]

    def get_requests(self):
        feed = self.gc.GetListFeed(self.formid)
        return [
            Request(row) for row in feed.entry
            if not row.custom[self.statcol].text
        ]

    def update_request(self, request):
        request.entry = self.gc.UpdateRow(request.entry, request.data())
Exemple #3
0
 def __init__(self,uname,pswd,document_name=None,worksheet_name=None):
     # Connect to Google
     self.gd_client = SpreadsheetsService(email=uname,password=pswd,source=GDATA_SOURCE)
     self.gd_client.ProgrammaticLogin()
     if document_name is not None:
         self.set_document(document_name)
     if worksheet_name is not None:
         self.set_worksheet(worksheet_name)
Exemple #4
0
    def __init__(self, user, pw):
        self.document_name = ''
        self.current_key = ''

        # Login process
        self.gd_client = SpreadsheetsService()
        self.gd_client.email = user
        self.gd_client.password = pw
        self.gd_client.ProgrammaticLogin()
Exemple #5
0
 def sheets_service(self):
     """ A SpreadsheetsService singleton, used to perform
     operations on the actual spreadsheet. """
     if not hasattr(self, '_sheets_service'):
         service = SpreadsheetsService()
         service.email = self.google_user
         service.password = self.google_password
         service.source = SOURCE_NAME
         service.ProgrammaticLogin()
         self._sheets_service = service
     return self._sheets_service
Exemple #6
0
def Auth(email=None, password=None):
    """Get a reusable google data client."""
    gd_client = SpreadsheetsService()
    gd_client.source = "texastribune-ttspreadimporter-1"
    if email is None:
        email = os.environ.get('GOOGLE_ACCOUNT_EMAIL')
    if password is None:
        password = os.environ.get('GOOGLE_ACCOUNT_PASSWORD')
    if email and password:
        gd_client.ClientLogin(email, password)
    return gd_client
Exemple #7
0
    def getClient(self):
        """Returns a spreadsheet client if there's an Internet connection.

        Returns None if no connection.
        """
        client = SpreadsheetsService()

        try:
            client.GetWorksheetsFeed(self.spreadsheet_key, visibility='public',
                                     projection='basic')
        except gaierror:
            client = None

        return client
 def reload_data(self):
     self.client = SpreadsheetsService()
     feed = self.client.GetWorksheetsFeed(self.key,
                                          visibility='public',
                                          projection='basic')
     self.data = {}
     for entry in feed.entry:
         if entry.title.text in self.worksheets.keys():
             bad_rows, total_rows = self.process_sheet(
                 entry.id.text.split("/")[-1],
                 self.worksheets[entry.title.text])
             print "Skipped %d / %d rows in sheet \"%s\"" % (
                 bad_rows, total_rows, entry.title.text)
         elif DEBUG:
             print "Skipped sheet \"%s\"" % entry.title.text
 def __init__(self, usr, pwd, form):
     global FORM_COLS
     self.gc = SpreadsheetsService()
     self.gc.ClientLogin(username=usr, password=pwd, source=APP_NAME)
     query = DocumentQuery()
     query.title = form
     feed = self.gc.GetSpreadsheetsFeed(query=query)
     if not feed.entry: raise Exception, "empty spreadsheet list"
     self.formid = feed.entry[0].id.text.rsplit('/', 1)[1]
     #        self.update = feed.entry[0].updated.text
     query = CellQuery()
     query.range = '%c1:%c1' % tuple(i + ord('A')
                                     for i in (0, len(FORM_KEYS) - 1))
     feed = self.gc.GetCellsFeed(self.formid, query=query)
     FORM_COLS = tuple(''.join(c for c in cell.content.text.lower()
                               if c.isalnum() or c in '-_')
                       for cell in feed.entry)
     self.statcol = FORM_COLS[FORM_KEYS.index('status')]
Exemple #10
0
def get_cos_contributors():
    ''' returns a list of contribtors from a google spreadsheet
    specified by spreadsheet ID '''

    client = SpreadsheetsService()

    feed = client.GetWorksheetsFeed(DOC_KEY, visibility='public', wksht_id='1')
    sheet_key = feed.id.text.rsplit('/', 1)[1]

    list_of_rows = client.GetListFeed(DOC_KEY, sheet_key,
                                      visibility='public').entry
    print('There are {} rows on the COS spreadsheet.'.format(
        len(list_of_rows)))

    cos_users = []
    for row in list_of_rows:
        row_dict = row.custom
        cos_users.append(row_dict[COL_ROW_NAME].text)

    return cos_users
Exemple #11
0
class GoogleSpreadsheetReader(object):
    """
    Directs Google Spreadsheets service client to obtain spreadsheet cells.
    """
    def __init__(self, options):
        """Init the Google Spreadsheets service client."""
        self.options = options
        self.service = GoogleSpreadsheetsService()
        if not self.options.google_email:
            print "Warning: Google account email not provided."
        if not self.options.google_password:
            print "Warning: Google account password not provided."
        self.service.email = self.options.google_email
        self.service.password = self.options.google_password
        self.service.ProgrammaticLogin()
        if not self.options.google_spreadsheet_key:
            print "Warning: Google spreadsheet key not provided."

    def get_cells(self, sheet_index=0):
        """Returns a dict of cell data keyed by cell coordinate (row, col)."""
        cells = {}
        spreadsheet_key = self.options.google_spreadsheet_key
        sheets_feed = self.service.GetWorksheetsFeed(spreadsheet_key)
        sheet_id = sheets_feed.entry[sheet_index].id.text.split('/')[-1]
        cells_feed = self.service.GetCellsFeed(spreadsheet_key, sheet_id)
        for entry in cells_feed.entry:
            try:
                row_id = entry.cell.row
                col_id = entry.cell.col
                data = entry.content.text
            except Exception, inst:
                msg = "Couldn't read cell feed entry: %s" % inst
                msg += "\n%s" % entry
                raise Exception, msg
            try:
                row_id = int(row_id)
                col_id = int(col_id)
            except:
                continue
            cells[(row_id, col_id)] = data
        return cells
Exemple #12
0
 def __init__(self, options):
     """Init the Google Spreadsheets service client."""
     self.options = options
     self.service = GoogleSpreadsheetsService()
     if not self.options.google_email:
         print "Warning: Google account email not provided."
     if not self.options.google_password:
         print "Warning: Google account password not provided."
     self.service.email = self.options.google_email
     self.service.password = self.options.google_password
     self.service.ProgrammaticLogin()
     if not self.options.google_spreadsheet_key:
         print "Warning: Google spreadsheet key not provided."
Exemple #13
0
class SpreadSheet:
    def __init__(self, user, pw):
        self.document_name = ''
        self.current_key = ''

        # Login process
        self.gd_client = SpreadsheetsService()
        self.gd_client.email = user
        self.gd_client.password = pw
        self.gd_client.ProgrammaticLogin()

    def set_document(self, name):
        self.document_name = name
        self._get_current_key()

    def _get_current_key(self):
        feed = self.gd_client.GetSpreadsheetsFeed()
        for entry in feed.entry:
            if entry.title.text == self.document_name:
                self.current_key = entry.id.text.split('/')[-1]
                break
            else:
                raise ValueError('document no found')

    def _get_worksheet_index(self):
        wksht_idx = len(
            self.gd_client.GetWorksheetsFeed(key=self.current_key).entry) - 1
        #print "Worksheet Index:", wksht_idx
        return wksht_idx

    def get_current_key(self):
        return self.current_key

    def get_worksheet_id(self):
        wksht_idx = self._get_worksheet_index()
        wksht_id = self.gd_client.GetWorksheetsFeed(
            key=self.current_key).entry[wksht_idx].id.text.rsplit('/', 1)[1]

        #print "Worksheet ID:", wksht_id
        return wksht_id

    def create_worksheet(self, name):
        worksheet = self.gd_client.AddWorksheet(title=name,
                                                row_count=100,
                                                col_count=20,
                                                key=self.current_key)

    def insert_row(self, data, wksht_id):
        for i, row in enumerate(data):
            for j, value in enumerate(row):
                self.gd_client.UpdateCell(row=i + 1,
                                          col=(j + 1),
                                          inputValue=value,
                                          key=self.current_key,
                                          wksht_id=wksht_id)
Exemple #14
0
 def sheets_service(self):
     """ A SpreadsheetsService singleton, used to perform
     operations on the actual spreadsheet. """
     if not hasattr(self, '_sheets_service'):
         service = SpreadsheetsService()
         service.email = self.google_user
         service.password = self.google_password
         service.source = SOURCE_NAME
         service.ProgrammaticLogin()
         self._sheets_service = service
     return self._sheets_service
Exemple #15
0
###############################################################################

SERVICE_ACCOUNT_EMAIL = '*****@*****.**'
PROJECT_NUMBER = '456961407722'

pemfile = file('tinkeracademy.pem', 'rb')
key = pemfile.read()
pemfile.close()

credentials = None

googlehttp = None

GOOGLE_DRIVE_SERVICE = None

GOOGLE_SPREADSHEETS_SERVICE = SpreadsheetsService()

###############################################################################
## Code below updates the tokens
###############################################################################


def updatetokens():
    global credentials
    credentials = SignedJwtAssertionCredentials(
        SERVICE_ACCOUNT_EMAIL,
        key,
        scope=[
            constants.GOOGLE_DRIVE_SCOPE, constants.GOOGLE_SPREADSHEETS_SCOPE
        ],
        sub='*****@*****.**')
Exemple #16
0
def main(argv=sys.argv):
    if len(argv) < 2:
        usage(argv)
    config_uri = argv[1]
    options = parse_vars(argv[2:])
    setup_logging(config_uri)
    settings = get_appsettings(config_uri, options=options)
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    with transaction.manager:
        user = User(username='******', password='******', email=settings.get('admin_mail'), activated=True,
                    groups='admin')
        DBSession.add(user)
        DBSession.flush()
        for k, v in Settings.settings.iteritems():
            if type(v) == dict:
                DBSession.add(Settings(key=unicode(k), value=unicode(v['value'])))
            else:
                if isinstance(v, basestring) and v.startswith('app_config.'):
                    v = settings.get(v[len('app_config.'):])
                DBSession.add(Settings(key=unicode(k), value=unicode(v)))

        DBSession.flush()

        print 'Starting to import data from spreadsheets'
        # if 1:
        if 0:
            from gdata.spreadsheet.service import SpreadsheetsService

            founder_columns = {}
            founders = []
            to_founder_keys = {'Technical Co-Founder List?': 'technical',
                               'Email': 'email',
                               'Address': 'address',
                               'Alchemist Class #': 'alchemistclass',
                               'Public LinkedIn Profile': 'linkedin_profile',
                               'Areas of Expertise': 'tags',
                               'First Name': 'firstname',
                               'Last Name': 'lastname',
                               'Are you the CEO? (One and only one co-founder must be designated CEO)': 'ceo',
                               'Phone (Other)': '',
                               'Title': 'title',
                               'Are you an IoT company?': 'company.iot',
                               'Base Row': '',
                               'Company': 'company',
                               'Phone (Mobile)': 'phone',
                               }
            to_company_keys = {
                'name': 'name',
                'description': 'description',
                'twitterhandle': 'twitterhandle',
                'email': 'email',
                'website': 'website',
                'tags': 'tags',
                'braggingtext': 'braggingtext',
                'totalraise': 'totalraise',
                'haveraise': 'haveraise',
                'logo': 'logo',
                'alchemistclass': 'alchemistclass'
            }
            ss = SpreadsheetsService()
            ss.email = 'email'
            ss.password = '******'
            ss.ProgrammaticLogin()

            for shkey in []:
                print 'Starting to import companies from %s' % shkey
                linenr = 0
                linestoskip = 7
                headerline = 2
                company_columns = {}
                for t in ss.GetListFeed(shkey).entry:
                    linenr += 1
                    if headerline == linenr:
                        for k, v in t.custom.iteritems():
                            if (v.text or '').strip() in to_company_keys:
                                company_columns[k] = to_company_keys[v.text.strip()]
                        print 'Got company_columns %s' % company_columns
                    if linenr <= linestoskip:
                        print 'skipping line %s' % linenr
                        continue
                    company_values = {}
                    for k, v in t.custom.iteritems():
                        if k in company_columns:
                            val = (getattr(v, 'text', '') or '').strip()
                            if company_columns[k] in BaseCompany.boolean_fields:
                                val = val.lower() == 'yes'
                            company_values[company_columns[k]] = val
                    company_values['tags'] = handle_if_tags('tags', company_values.get('tags', '').strip())
                    if not (company_values and company_values.get('name', '').strip() and
                                company_values.get('description', '').strip()):
                        print 'skipping because no values %s ' % company_values
                        continue
                    if 'logo' in company_values:
                        logo_ = 'http%s' % company_values['logo'].split('http')[-1]
                        if 'dropbox' in logo_:
                            logo_ = logo_.split('?')[0] + '?raw=1'
                        elif 'google' in logo_:
                            logo_ = 'https://drive.google.com/uc?export=view&id=%s' % \
                                    logo_.split('/')[-2]
                        company_values['logo'] = logo_

                    company = DBSession.query(BaseCompany).filter(
                        BaseCompany.name == company_values['name']).first()
                    if company and company.description and company.email:
                        print 'not importing %s already filled some info' % company.name
                        continue
                    elif company:
                        for cf, cv in company_values.iteritems():
                            if hasattr(company, cf):
                                setattr(company, cf, cv)
                    else:
                        DBSession.add(BaseCompany(**company_values))

            DBSession.flush()

            print 'stating to import founders'

            for t in ss.GetListFeed('0As9IMrvjc_FqdGdiOHN3ZWhveWg5eHF1WVdmbGI4Vnc').entry:
                if not founder_columns:
                    for k, v in t.custom.iteritems():
                        founder_columns[k] = v.text
                else:
                    founder = {}
                    for k, v in t.custom.iteritems():
                        if k in founder_columns and founder_columns[k] and \
                                to_founder_keys[founder_columns[k].strip()]:
                            founder[to_founder_keys[founder_columns[k].strip()]] = v.text
                    founders.append(founder)

            for founder in founders:
                company_name = founder.pop('company')
                is_iot = founder.pop('company.iot') if 'company.iot' in founder else False
                address = founder.pop('address') if 'address' in founder else ''
                try:
                    if founder.get('email', '').strip():
                        assert re.match(r'[^@]+@[^@]+\.[^@]+', founder['email'].strip()), 'Invalid email address'
                except:
                    print 'invalid email %s' % founder['email'] if 'email' in founder else '(no email)'
                    founder['email'] = '*****@*****.**'
                if company_name:
                    for c_ext in ['INC', 'Inc.', 'Inc', 'LLC', 'llc']:
                        company_name = company_name.replace(c_ext, '').strip()
                    company = DBSession.query(BaseCompany).filter(
                        func.lower(BaseCompany.name) == func.lower(company_name)).first()
                    for split_str in ['/', '  ', ' ']:
                        if company:
                            break
                        for cn_split in company_name.split(split_str):
                            company = DBSession.query(BaseCompany).filter(
                                func.lower(BaseCompany.name).startswith(func.lower(cn_split.strip()))).first()
                            if company:
                                break
                if company:
                    company_id = company.id
                    if is_iot and is_iot.lower() == 'yes':
                        company.iot = True
                    c_address = getattr(company, 'address', '')
                    if address and (not c_address or not str(c_address).strip()):
                        company.address = address
                else:
                    company = BaseCompany(name=company_name, iot=(is_iot and is_iot.lower() == 'yes'),
                                          address=address)
                    DBSession.add(company)
                    DBSession.flush()
                    company_id = company.id
                for k in ['ceo', 'technical']:
                    if k not in founder:
                        founder[k] = False
                    founder[k] = str(founder[k]).lower() == 'yes'
                if company and not company.alchemistclass and 'alchemistclass' in founder and founder[
                    'alchemistclass'].strip():
                    try:
                        company.alchemistclass = int(founder['alchemistclass'].strip())
                    except:
                        pass
                if 'ceo' in founder and founder['ceo'] and not ('CEO' in founder['title'] or 'C.E.O' in founder['title']
                                                                or 'Chief Exec' in founder['title']):
                    founder['title'] += (', ' if founder['title'].strip() else '') + 'CEO'

                if 'ceo' in founder and 'title' in founder and \
                        not founder['ceo'] and ('CEO' in founder['title'] or 'C.E.O' in founder['title']
                                                or 'Chief Exec' in founder['title']):
                    founder['ceo'] = True
                founder['company_id'] = company_id
                founder['activated'] = True
                founder['active'] = True

                founder['tags'] = handle_if_tags('tags', founder.get('tags', '').strip(), [',', 'and', ';'])

                DBSession.add(Founder(**founder))
Exemple #17
0
class GDataClient(object):
    gd_client = None
    gd_cur_ss_key = None
    gd_cur_ws_id = None
    
    def __init__(self,uname,pswd,document_name=None,worksheet_name=None):
        # Connect to Google
        self.gd_client = SpreadsheetsService(email=uname,password=pswd,source=GDATA_SOURCE)
        self.gd_client.ProgrammaticLogin()
        if document_name is not None:
            self.set_document(document_name)
        if worksheet_name is not None:
            self.set_worksheet(worksheet_name)
    
    def __ss_check(self):
        ''' Make sure spreadsheet has been set before we try to do anything with worksheets.
        '''
        if self.gd_cur_ss_key is None:
            raise GDError('Must set spreadsheet before accessing worksheets!')

    def __header_to_key(self,hdr_string):
        ''' Google sheets column headers are used as keys to row info dictionaries, but first most non alphanumeric 
        characters are removed and the letters are lower-cased.
            
        '''
        return ''.join(re.findall('[a-z\-0-9\.]+',hdr_string.lower()))

    def set_document(self, docname):
        ''' Set current spreadsheet document given a title.
        '''
        q = DocumentQuery(params={'title':docname,'title-exact':'true'})
        ss_feed = self.gd_client.GetSpreadsheetsFeed(query=q)
        if len(ss_feed.entry) != 1:
            raise GDError('{} spreadsheets match the given name "{}" (expected exactly one)!'.format(len(ss_feed.entry),docname))
        self.gd_cur_ss_key = ss_feed.entry[0].id.text.rsplit('/',1)[1]

    def list_documents(self):
        ''' List all spreadsheet documents available.
        '''
        feed = self.gd_client.GetSpreadsheetsFeed(DocumentQuery())
        return [en.title.text for en in feed.entry]

    def list_worksheets(self):
        ''' List all worksheets in the current spreadsheet document.
        '''
        self.__ss_check()
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key)
        return [en.title.text for en in ws_feed.entry]

    def set_worksheet(self, sheetname):
        ''' Set current worksheet within the current spreadsheet document.
        '''
        self.__ss_check()
        q = DocumentQuery(params={'title':sheetname,'title-exact':'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,query=q)
        if len(ws_feed.entry) != 1:
            raise GDError('{} worksheets match the given name "{}" (expected exactly one)!'.format(len(ws_feed.entry),sheetname))
        self.gd_cur_ws_id = ws_feed.entry[0].id.text.rsplit('/',1)[1]

    def add_worksheet(self, title, rows, cols, overwrite=False):
        ''' Add a worksheet to current spreadsheet document (if it does not exist).  Switch current sheet to the (new) guy.
        '''
        self.__ss_check()
        
        # First, check to see if the worksheet already exists
        q = DocumentQuery(params={'title':title,'title-exact':'true'})
        ws_feed = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,query=q)
        ws_found = None
        if len(ws_feed.entry) > 0:
            if overwrite:
                if not self.gd_client.DeleteWorksheet(ws_feed.entry[0]):
                    raise GDError('Failed to delete existing worksheet named {} to overwrite!'.format(title))
            else:
                ws_found = ws_feed.entry[0]

        is_new = ws_found is None
        if is_new:
            ws_found = self.gd_client.AddWorksheet(title,rows,cols,self.gd_cur_ss_key)

        self.gd_cur_ws_id = ws_found.id.text.rsplit('/',1)[1]

        return is_new

    def set_headers(self,header_names):
        ''' Set the current worksheet headers given a list.
        NOTE: header list length must match columns in sheet!
        '''
        self.__ss_check()
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.col_count.text) != len(header_names):
            raise GDError('Number of headers ({}) does not match columns in spreadsheet ({})!'
                          .format(len(header_names),int(ws.col_count.text)))

        query = CellQuery(params={'min-row':'1','max-row':'1','min-col':'1','max-col':str(len(header_names))})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=query)
        if len(cells.entry) == 0:
            query = CellQuery(params={'min-row':'1','max-row':'1','min-col':'1','max-col':str(len(header_names)),'return-empty':'true'})
            cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=query)

        batchRequest = SpreadsheetsCellsFeed()

        for idx,val in enumerate(header_names):
            cells.entry[idx].cell.inputValue = val
            batchRequest.AddUpdate(cells.entry[idx])

        updated = self.gd_client.ExecuteBatch(batchRequest,cells.GetBatchLink().href)


    def delete_rows(self,row_list):
        ''' Given a list of row numbers, delete the corresponding rows from the current sheet.
        @param row_list List of integers in 0:num_rows
        '''
        if len(row_list) == 0:
            return

        self.__ss_check()
        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)
        ordered_list = sorted(row_list,reverse=True)
        if ordered_list[0] > int(list_feed.total_results.text):
            raise GDError('Tried to delete row {} but highest row number in sheet is {}!'.format(ordered_list[0],list_feed.total_results.text))

        for row in ordered_list:
            if not self.gd_client.DeleteRow(list_feed.entry[row]):
                raise GDError('Failed to delete row {} (partway through list: {})'.format(row,ordered_list))        
        
    def insert_rows(self,info_dict_list):
        ''' Add a group of rows based on list of dictionaries.

        @param info_dict_list Dictionary with keys for any headers with non-blank info in the new row 
        (all missing keys will be have blank data).
        '''
        self.__ss_check()
        # Check and make sure that none of the input dictionaries contains keys not in the first row of our list.
        hdr_list = [self.__header_to_key(hdr) for hdr in self.row_as_list(1)]
        bad_row = next((dd for dd in info_dict_list if len(set(dd.keys()) - set(hdr_list)) > 0), None)
        if bad_row is not None:
            raise GDError('Failed to insert row {} because it contains keys not in spreadsheet headers ({})!'
                          .format(bad_row,hdr_list))

        list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)
        for inf in info_dict_list:
            self.gd_client.InsertRow(inf,self.gd_cur_ss_key, self.gd_cur_ws_id)

    def column_as_list(self,column,with_header=False):
        ''' Read just a single column into a list of strings.

        Ignore the first row by default because it's the column header.
        '''
        self.__ss_check()
        if with_header:
            minrow = 1
        else:
            minrow = 2
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.row_count.text) < minrow:
            return []

        q = CellQuery(params={'min-row':str(minrow),'min-col':str(column),'max-col':str(column)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=q)

        return [cellentry.cell.text for cellentry in cells.entry]

    def row_as_list(self,row):
        ''' Read just a single row into a list of strings.

        NOTE: this is indexed by cell, so row 1 is the header row!
        '''
        self.__ss_check()
        mincol = 1
        ws = self.gd_client.GetWorksheetsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id)
        if int(ws.col_count.text) < mincol:
            return []

        q = CellQuery(params={'min-row':str(row),'min-col':str(mincol),'max-row':str(row)})
        cells = self.gd_client.GetCellsFeed(self.gd_cur_ss_key,self.gd_cur_ws_id,query=q)

        return [cellentry.cell.text for cellentry in cells.entry]
        

    def read_to_list(self,num_lines=None):
        """ Read the sheet into a list of lists of strings """
        self.__ss_check()
        if num_lines is not None:
            q = DocumentQuery(params={'max-results':'%d'%num_lines})
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id,query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)

        string_list = []
        for i, entry in enumerate(list_feed.entry):
            row = {key:entry.custom[key].text for key in entry.custom}
            row['rowname'] = entry.title.text
            string_list.append(row)
        
        return string_list

    def read_to_dict(self,key_column_name,row_start=None,row_num=None):
        """ Read the sheet into a dictionary with keys given by the named column.
        
        NOTE: Raises an error on any redundant rows.
        """

        self.__ss_check()

        # Use this for testing to limit number of results handled:
        params = {}
        if row_start is not None:
            params['start-index'] = '%d'%row_start
        if row_num is not None:
            params['max-results'] = '%d'%row_num
        if len(params) != 0:
            q = ListQuery(params=params)
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id,query=q)
        else:
            list_feed = self.gd_client.GetListFeed(self.gd_cur_ss_key, self.gd_cur_ws_id)

        multi_rows = []
        string_dict = {}
        for i, entry in enumerate(list_feed.entry):
            # If we get the title column, ignore it.
            if entry.custom[key_column_name].text.replace(' ','').lower() == key_column_name:
                continue
            row = {key:entry.custom[key].text for key in entry.custom if key is not key_column_name}
            row['rowname'] = entry.title.text
            key_name = entry.custom[key_column_name].text
            if key_name in string_dict:
                multi_rows.append(key_name)
            string_dict[key_name] = row

        if len(multi_rows) > 0:
            errors = 'read_to_dict -- Column {} contains multiple rows with each of the following values: {}'.format(key_column_name,multi_rows)
            # raise GDError('read_to_dict -- Column {} contains multiple rows with each of the following values: {}'.format(key_column_name,multi_rows))
        else:
            errors = None

        return string_dict,errors
Exemple #18
0
def load_google_spreadsheet(spreadsheet_key, worksheet_name=None):
    """Downloads and exports a Google Spreadsheet in TSV format.

    Returns a string containing the spreadsheet contents in TSV format (e.g.
    for writing out to a file or parsing).

    The first line is assumed to be the spreadsheet header (i.e. containing
    column names), which can optionally be followed by one or more comment
    lines (starting with '#'). Only the first cell of a comment line will be
    parsed (to keep exported spreadsheets consistent with QIIME mapping files'
    comments). The (optional) comments section is then followed by the
    spreadsheet data.

    Some of this code is based on the following websites, as well as the
    gdata.spreadsheet.text_db module:
        http://www.payne.org/index.php/Reading_Google_Spreadsheets_in_Python
        http://stackoverflow.com/a/12031835

    Arguments:
        spreadsheet_key - the key used to identify the spreadsheet (a string).
            Can either be a key or a URL containing the key
        worksheet_name - the name of the worksheet to load data from (a
            string). If not supplied, will use first worksheet in the
            spreadsheet
    """
    spreadsheet_key = _extract_spreadsheet_key_from_url(spreadsheet_key)
    gd_client = SpreadsheetsService()

    try:
        worksheets_feed = gd_client.GetWorksheetsFeed(spreadsheet_key,
                                                      visibility='public',
                                                      projection='basic')
    except gaierror:
        raise GoogleSpreadsheetConnectionError("Could not establish "
                                               "connection with server. Do "
                                               "you have an active Internet "
                                               "connection?")

    if len(worksheets_feed.entry) < 1:
        raise GoogleSpreadsheetError("The Google Spreadsheet with key '%s' "
                                     "does not have any worksheets associated "
                                     "with it." % spreadsheet_key)

    # Find worksheet that will be exported. If a name has not been provided,
    # use the first worksheet.
    worksheet = None
    if worksheet_name is not None:
        for sheet in worksheets_feed.entry:
            if sheet.title.text == worksheet_name:
                worksheet = sheet

        if worksheet is None:
            raise GoogleSpreadsheetError("The worksheet name '%s' could not "
                                         "be found in the Google Spreadsheet "
                                         "with key '%s'." %
                                         (worksheet_name, spreadsheet_key))
    else:
        # Choose the first one.
        worksheet = worksheets_feed.entry[0]

    # Extract the ID of the worksheet.
    worksheet_id = worksheet.id.text.split('/')[-1]

    # Now that we have a spreadsheet key and worksheet ID, we can read the
    # data. First get the headers (first row). We need this in order to grab
    # the rest of the actual data in the correct order (it is returned
    # unordered).
    headers = _get_spreadsheet_headers(gd_client, spreadsheet_key,
                                       worksheet_id)
    if len(headers) < 1:
        raise GoogleSpreadsheetError("Could not load spreadsheet header (it "
                                     "appears to be empty). Is your Google "
                                     "Spreadsheet with key '%s' empty?" %
                                     spreadsheet_key)

    # Loop through the rest of the rows and build up a list of data (in the
    # same row/col order found in the spreadsheet).
    spreadsheet_lines = _export_spreadsheet(gd_client, spreadsheet_key,
                                            worksheet_id, headers)

    out_lines = StringIO()
    tsv_writer = writer(out_lines, delimiter='\t', lineterminator='\n')
    tsv_writer.writerows(spreadsheet_lines)
    return out_lines.getvalue()
class SheetQuerier:
    key = "0AkGO8UqErL6idDhYYjg1ZXlORnRaM3ZhTks4Z3FrYlE"
    worksheets = {"DVD": "movietitle", "Blu-ray": "movietitle"}
    cache = "spreadsheet_cache.p"
    cache_timeout = 60 * 60 * 24 * 7
    prefixes = ["The", "A"]
    whitespace_pattern = re.compile('[^a-zA-Z0-9 ]+', re.UNICODE)

    def __init__(self, force_reload=False):
        try:
            pickle_date, self.data = pickle.load(open(self.cache, "rb"))
            if DEBUG:
                print "Loaded cache successfully"
            if force_reload or time.time() - pickle_date > self.cache_timeout:
                if DEBUG:
                    if force_reload:
                        print "Cache update forced."
                    else:
                        print "Cache too old, reloading."
                self.reload_data()
                pickle.dump((time.time(), self.data), open(self.cache, "wb"))
        except:
            if DEBUG:
                print "Problem loading cache, reloading."
            self.reload_data()
            pickle.dump((time.time(), self.data), open(self.cache, "wb"))

    def reload_data(self):
        self.client = SpreadsheetsService()
        feed = self.client.GetWorksheetsFeed(self.key,
                                             visibility='public',
                                             projection='basic')
        self.data = {}
        for entry in feed.entry:
            if entry.title.text in self.worksheets.keys():
                bad_rows, total_rows = self.process_sheet(
                    entry.id.text.split("/")[-1],
                    self.worksheets[entry.title.text])
                print "Skipped %d / %d rows in sheet \"%s\"" % (
                    bad_rows, total_rows, entry.title.text)
            elif DEBUG:
                print "Skipped sheet \"%s\"" % entry.title.text

    def process_sheet(self,
                      sheet_key,
                      movie_row_key,
                      type_row_key="forcedsubtitletype"):
        if DEBUG:
            print "Document: %s" % self.key
            print "Sheet: %s" % sheet_key

        rows = self.client.GetListFeed(self.key,
                                       sheet_key,
                                       visibility='public',
                                       projection='values').entry
        bad_rows = 0
        for row in rows:
            try:
                self.data[SheetQuerier.clean_title(
                    row.custom[movie_row_key].text.strip(
                    ))] = row.custom[type_row_key].text.strip()
            except:
                bad_rows += 1
        return bad_rows, len(rows)

    def query_exact(self, title):
        query = SheetQuerier.clean_title(title)
        if query in self.data:
            return self.data[query]
        else:
            return False

    @staticmethod
    def clean_title(title):
        # Move prefixes
        for prefix in SheetQuerier.prefixes:
            if title.endswith(prefix):
                title = "%s %s" % (prefix, title[:-1 * len(prefix) - 2])
                break
        # Strip all non alpha-numeric characters
        title = SheetQuerier.whitespace_pattern.sub('', title)

        # Return the lowercase version
        return title.lower()