accesskey = "" secretkey = "" conn = S3Connection(accesskey, secretkey) bucketname = "dataiap.mit.edu.ap" try: bucket = conn.create_bucket(bucketname) except: print "could not create bucket ", bucketname bucket = conn.get_bucket(bucketname) def upload(category, fname, root): if fname.startswith("urn"): key = Key(bucket) key.key = "%s_%s" % (category, fname) key.set_contents_from_filename("%s/%s" % (root, fname)) root = os.path.abspath(sys.argv[1]) walk_news(root, upload) # retrieve ALL articles in World bucket.get_all_keys(prefix="World") conn.close() # return self._get_all([('CommonPrefixes', Prefix)], # '', None, {})
that we have no information about this documents' categories id -- a unique ID for the document (any kind of JSON-able value should work). If not specified, we'll auto-generate one. """ text = unicode(text, errors='ignore') cats = dict((unicode(cat), bool(is_in_cat)) for cat, is_in_cat in (cats or {}).iteritems()) return JSONValueProtocol.write( None, {'document': text, 'cats': cats, 'docid': id, 'type' : 'document'}) + '\n' root = os.path.abspath(sys.argv[1]) outroot = os.path.abspath(sys.argv[2]) def encode(category, fname, root): global outroot try: os.mkdir(os.path.join(outroot, category)) except: pass with file(os.path.join(root, fname), 'r') as f: with file(os.path.join(outroot, category, fname), 'w') as outf: outf.write(encode_document(f.read(), {category:1}, fname)) walk_news(root, encode)
""" text = unicode(text, errors='ignore') cats = dict((unicode(cat), bool(is_in_cat)) for cat, is_in_cat in (cats or {}).iteritems()) return JSONValueProtocol.write(None, { 'document': text, 'cats': cats, 'docid': id, 'type': 'document' }) + '\n' root = os.path.abspath(sys.argv[1]) outroot = os.path.abspath(sys.argv[2]) def encode(category, fname, root): global outroot try: os.mkdir(os.path.join(outroot, category)) except: pass with file(os.path.join(root, fname), 'r') as f: with file(os.path.join(outroot, category, fname), 'w') as outf: outf.write(encode_document(f.read(), {category: 1}, fname)) walk_news(root, encode)
from boto.s3.connection import S3Connection from boto.s3.key import Key accesskey = '' secretkey = '' conn = S3Connection(accesskey, secretkey) bucketname = 'dataiap.mit.edu.ap' try: bucket = conn.create_bucket(bucketname) except: print "could not create bucket ", bucketname bucket = conn.get_bucket(bucketname) def upload(category, fname, root): if fname.startswith('urn'): key = Key(bucket) key.key = '%s_%s' % (category, fname) key.set_contents_from_filename('%s/%s' % (root, fname)) root = os.path.abspath(sys.argv[1]) walk_news(root, upload) # retrieve ALL articles in World bucket.get_all_keys(prefix='World') conn.close() # return self._get_all([('CommonPrefixes', Prefix)], # '', None, {})