class TroveSwiftIndex(TroveIndex):
    """A Trove Index class that uses an in memory dictionary to store
    the index - for access to data stored in a Swift container"""

    def __init__(self):
        """Create the Swift accessor, then run the base class initialiser."""
        self.swifttext = SwiftTextContainer()
        super(TroveSwiftIndex, self).__init__()

    def get_document(self, id):
        """Get a document from the datafile given the document id.

        id - key passed to self.get() to look up the index entry, which
             is expected to unpack into (offset, length, datafile)

        Return the decoded JSON value for the document (normally a Python
        dictionary with the document properties), or None if the id cannot
        be resolved or there is no valid data at this offset.
        """
        try:
            offset, length, datafile = self.get(id)
        except Exception:
            # Lookup failed or the entry did not unpack into three fields.
            # Exception rather than a bare except, so that
            # KeyboardInterrupt and SystemExit are not swallowed.
            return None

        # local indexer has stored full path to the file, truncate for swift
        datafile = os.path.basename(datafile)
        line = self.swifttext.get_by_offset(datafile, offset, length)

        try:
            return json.loads(line)
        except (TypeError, ValueError):
            # TypeError: nothing usable came back (e.g. None);
            # ValueError: the bytes at this offset are not valid JSON.
            return None
def countwords(document):
    """Tally documents and words per year for one Swift object.

    document - mapping describing the object; document['name'] selects
               the object to read and also names the output
               ("wordcount-" + name)

    Every line yielded by SwiftTextContainer.document_lines() is decoded
    as UTF-8 JSON.  The first four characters of its 'date' field are
    used as the year and 'wordCount' is converted to an integer.  Lines
    that cannot be decoded are marked with '!*!' on stdout and skipped;
    records without a usable date or word count are skipped silently.

    The running totals are handed to write() (defined elsewhere in this
    file) after every INTERVAL counted records, and once more at the end
    if records have been counted since the last write.  Errors raised by
    write() are not suppressed.

    Returns None.
    """
    # Single pre-formatted argument: identical output whether print is
    # the statement or the function.
    print("COUNT %s" % (document,))
    docname = document['name']
    outfile = "wordcount-" + docname
    sw = SwiftTextContainer()

    count = dict()      # year -> number of documents
    wordcount = dict()  # year -> total word count
    n = 0               # number of records counted so far

    for offset, line in sw.document_lines(docname):
        try:
            doc = json.loads(line.decode('utf-8'))
        except ValueError:
            # covers both invalid UTF-8 and invalid JSON
            sys.stdout.write('!*!')
            sys.stdout.flush()
            continue

        try:
            year = doc['date'][:4]
            wc = int(doc['wordCount'])
            # The first document of a year counts as 1, not 0.
            count[year] = count.get(year, 0) + 1
            wordcount[year] = wordcount.get(year, 0) + wc
        except (KeyError, TypeError, ValueError):
            # missing or malformed date / wordCount: leave the record out
            continue

        n += 1
        if n % INTERVAL == 0:
            write(count, wordcount, outfile)
            sys.stdout.write("%s|" % n)
            sys.stdout.flush()

    # Records counted since the last periodic write would otherwise be lost.
    if n % INTERVAL != 0:
        write(count, wordcount, outfile)
def __init__(self, datafile, out='index.idx'):
    """Prepare Swift access for a data file, then run the base builder.

    datafile - the name of the source data file
    out      - forwarded unchanged to the base class initialiser,
               default 'index.idx'
    """
    # Both attributes are assigned before the base initialiser runs.
    self.datafile = datafile
    self.swifttext = SwiftTextContainer()
    super(TroveSwiftIndexBuilder, self).__init__(datafile, out)
class TroveSwiftIndexBuilder(TroveIndexBuilder):
    """Build an index for documents stored in a Swift object store"""

    def __init__(self, datafile, out='index.idx'):
        """Create a Trove Index containing offsets of each document.

        datafile - the name of the source data file
        out      - forwarded unchanged to the base class initialiser,
                   default 'index.idx'
        """
        # Both attributes are assigned before the base initialiser runs.
        self.swifttext = SwiftTextContainer()
        self.datafile = datafile
        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)

    def add_to_index(self, id, offset, length):
        """Add this id/offset pair to the index.

        Writes one "id, offset, length, filename" line to self.out
        (self.out is not assigned in this class; presumably the base
        class provides it).
        """
        # for swift we use the file basename
        self.out.write("%s, %d, %d, %s\n"
                       % (id, offset, length, os.path.basename(self.datafile)))

    def _build_index(self):
        """Build an index of the documents in the datafile.

        Each line from document_lines() is decoded as UTF-8 JSON.  A line
        is indexed under its 'id' together with its offset and the length
        of the undecoded line.  Lines that do not decode, or that are not
        JSON objects with an 'id', are reported as bad lines and skipped.
        """
        for offset, line in self.swifttext.document_lines(self.datafile):
            try:
                data = json.loads(line.decode('utf-8'))
            except ValueError:
                # invalid UTF-8 or invalid JSON; the message is a single
                # pre-formatted argument so the output is the same under
                # either form of print
                print("Bad line:  %s %s" % (offset, line))
                continue

            # Only JSON objects can carry an id; without the type check a
            # JSON string, list or number would raise here or on the lookup.
            if isinstance(data, dict) and 'id' in data:
                self.add_to_index(data['id'], offset, len(line))
            else:
                print("Bad line:  %s" % (line,))
def __init__(self):
    """Attach a SwiftTextContainer, then run the base class initialiser."""
    swift = SwiftTextContainer()
    self.swifttext = swift
    super(TroveSwiftIndex, self).__init__()
import optparse
import sys

# Command line: -s/--swift takes the inputs from a Swift container instead
# of the positional file arguments; -o/--outdir is the directory used to
# form each index path.
parser = optparse.OptionParser()
parser.add_option(
    "-s", "--swift",
    dest="swift", action="store_true", default=False,
    help="read data from a swift container")
parser.add_option(
    "-o", "--outdir",
    dest="outdir", action="store", default='index',
    help="output directory for index files")
options, args = parser.parse_args()

# Create the output directory on first use.
if not os.path.exists(options.outdir):
    os.makedirs(options.outdir)

if options.swift:
    # One index per document listed by the container; the index path is
    # <outdir>/<object name without extension>.idx
    container = SwiftTextContainer()
    for doc in container.documents():
        print(doc)
        name = doc['name']
        base, ext = os.path.splitext(name)
        out = os.path.join(options.outdir, base + ".idx")
        TroveSwiftIndexBuilder(name, out=out)
else:
    # One index per file named on the command line; the index path is
    # <outdir>/<file basename without extension>.idx
    for doc in args:
        print(doc)
        filename = os.path.basename(doc)
        base, ext = os.path.splitext(filename)
        out = os.path.join(options.outdir, base + ".idx")
        TroveIndexBuilder(doc, out=out)
count[year] += 1 else: count[year] = 0 if year in wordcount: wordcount[year] += wc else: wordcount[year] = wc n += 1 if n % INTERVAL == 0: write(count, wordcount, outfile) sys.stdout.write("%s|" % n) sys.stdout.flush() except: pass if __name__=='__main__': config = readconfig() INTERVAL = int(config.get('default', 'WC_INTERVAL')) processes = int(config.get('default', 'PROCESSES')) sw = SwiftTextContainer() pool = Pool(processes) pool.map(countwords, sw.documents())