def process_feed(feed):
    """Fetch a resource feed and download its 'document' entries as HTML files.

    The feed is resolved through the module-level ``client``. For every entry
    whose resource type is 'document', the title and the target filename are
    printed, and the entry is downloaded only when ``r.sadd`` returns a truthy
    value for that filename (presumably meaning the name was not yet in the
    'gdocs-wordcloud:filenames' set -- confirm against the ``r`` client).

    Relies on the module-level names ``client``, ``r`` and ``files``.
    """
    listing = client.GetResources(feed)
    print('\n')
    if not listing.entry:
        print('No entries in feed.\n')

    for resource in listing.entry:
        # Only documents are of interest; skip every other resource type.
        if resource.get_resource_type() != 'document':
            continue

        print(resource.title.text)

        # The sixth '/'-separated component of the entry id names the file.
        doc_key = resource.id.text.split('/')[5]
        filename = files + doc_key + '.html'
        print(filename)

        is_new = r.sadd('gdocs-wordcloud:filenames', filename)
        if is_new:
            # Open question carried over from the original: is this synchronous?
            client.download_resource(resource, filename)
def convertWithGDocsv3(File): """ Upload File and download html representation using Google Documents API v3 """ # Create and Authorize OAuth client client = CreateClient() #print "GDocsv3: client created" # Get file_type and encoding of uploaded file # i.e: file_type = 'text/plain', encoding = None (file_type, encoding) = mimetypes.guess_type(File.file.path) # If mimetype cannot be guessed # Check against known issues, then # finally, Raise Exception # Extract file extension and compare it to EXT_TO_MIME dict fileName, fileExtension = os.path.splitext(File.file.path) if file_type == None: if fileExtension.strip().lower() in EXT_TO_MIME: file_type = EXT_TO_MIME[fileExtension.strip().lower()] # If boy mimetypes.guess_type and EXT_TO_MIME fail to cover # file, return error else: raise Exception('Unknown file type') # Encapsulate File in Google's MediaSource Object media = gdata.data.MediaSource() media.SetFileHandle(File.file.path, file_type) #print "GDocsv3: MediaSource created" # Create a Resource to connect MediaSource to if File.title: file_title = File.title else: # If the django File obj has no title # Use the filename. 
This only affects the document # name in Karma Note's Google Docs account file_title = fileName.rsplit("/", 1)[1] doc = gdata.docs.data.Resource(type='document', title=file_title) #print "GDocsv3: resource created" # if pdf, append OCR=true to uri if file_type == 'application/pdf': create_uri = gdata.docs.client.RESOURCE_UPLOAD_URI + '?ocr=true' else: create_uri = gdata.docs.client.RESOURCE_UPLOAD_URI # Upload document and retrieve representation doc = client.CreateResource(entry=doc, create_uri=create_uri, media=media) #print "GDocsv3: resource sent" #print "file_type: " + str(file_type) # Create a dictionary for extra Google query variables #query_args = {'exportFormat': 'text/html'} # Download html representation of document #client.download_resource(entry=doc, file_path=File.file.path + '.html', extra_params=query_args) # exportFormat default is html. Sending exportFormat: text/html now produces 404s client.download_resource(entry=doc, file_path=File.file.path + '.html') #print "GDocsv3: resource downloaded" f = open(str(File.file.path) + '.html') File.html = f.read() File.save() f.close() '''