def process_feed(feed):
    """Download each previously unseen document in a feed as an HTML file.

    The argument is handed to ``client.GetResources`` and the returned feed
    is walked entry by entry. Entries whose resource type is not
    'document' are ignored. For each document the title and the target
    filename are printed; the filename is the module-level ``files`` prefix
    plus the sixth '/'-separated segment of the entry id plus '.html'.

    The download only happens when ``r.sadd`` returns a truthy value for
    the filename (presumably a Redis set-add that is truthy only for a
    member not seen before -- confirm against the ``r`` client in use).
    """
    feed = client.GetResources(feed)
    # Single-argument parenthesized prints behave the same as the print
    # statement on Python 2.
    print('\n')
    if not feed.entry:
        print('No entries in feed.\n')

    for entry in feed.entry:
        if entry.get_resource_type() != 'document':
            continue

        print(entry.title.text)
        filename = files + entry.id.text.split('/')[5] + '.html'
        print(filename)

        if not r.sadd('gdocs-wordcloud:filenames', filename):
            # Filename already recorded: skip the download.
            continue

        # NOTE(review): open question from the original author -- is
        # download_resource synchronous?
        client.download_resource(entry, filename)
# Example #2
# 0
def convertWithGDocsv3(File):
    """Upload File to Google Docs and store its HTML export on the object.

    Uses the Google Documents API v3 client returned by CreateClient().
    The file at ``File.file.path`` is uploaded as a 'document' resource,
    its HTML representation is downloaded to ``File.file.path + '.html'``,
    and the content of that file is assigned to ``File.html`` before
    ``File.save()`` is called.

    Args:
        File: object exposing ``file.path``, ``title``, ``html`` and
            ``save()`` (a Django file object according to the original
            comments -- confirm against the caller).

    Raises:
        Exception: with message 'Unknown file type' when neither
            ``mimetypes.guess_type`` nor ``EXT_TO_MIME`` yields a MIME
            type for the file.
    """
    # Create and authorize the OAuth client
    client = CreateClient()

    # Get file_type and encoding of the uploaded file,
    # e.g. file_type = 'text/plain', encoding = None
    (file_type, encoding) = mimetypes.guess_type(File.file.path)

    fileName, fileExtension = os.path.splitext(File.file.path)

    if file_type is None:
        # mimetypes could not guess the type: fall back to the table of
        # known extensions, comparing case-insensitively.
        extension = fileExtension.strip().lower()
        if extension not in EXT_TO_MIME:
            # Both mimetypes.guess_type and EXT_TO_MIME fail to cover
            # this file.
            raise Exception('Unknown file type')
        file_type = EXT_TO_MIME[extension]

    # Encapsulate the file in Google's MediaSource object
    media = gdata.data.MediaSource()
    media.SetFileHandle(File.file.path, file_type)

    # If the File object has no title, use the filename without its
    # extension. This only affects the document name in Karma Note's
    # Google Docs account. os.path.basename also copes with a path that
    # contains no '/' separator, where rsplit("/", 1)[1] would raise
    # IndexError.
    file_title = File.title or os.path.basename(fileName)

    # Create a Resource to connect the MediaSource to
    doc = gdata.docs.data.Resource(type='document', title=file_title)

    # For PDFs, append ocr=true to the upload URI
    create_uri = gdata.docs.client.RESOURCE_UPLOAD_URI
    if file_type == 'application/pdf':
        create_uri += '?ocr=true'

    # Upload the document and retrieve its representation
    doc = client.CreateResource(entry=doc, create_uri=create_uri, media=media)

    # Download the HTML representation of the document. No exportFormat is
    # sent on purpose: the default is HTML, and explicitly sending
    # exportFormat=text/html now produces 404s.
    html_path = File.file.path + '.html'
    client.download_resource(entry=doc, file_path=html_path)

    # Reopen exactly the path that was just written; the context manager
    # closes the handle even if read() raises.
    with open(html_path) as f:
        File.html = f.read()
    File.save()

    '''