Example #1
0
def countwords(document):

    print "COUNT", document


    docname = document['name']

    outfile = "wordcount-" + docname

    sw = SwiftTextContainer()

    count = dict()
    wordcount = dict()

    n = 0
    for offset,line in sw.document_lines(docname):

        try:
            doc = json.loads(line.decode('utf-8'))
        except:
            sys.stdout.write('!*!')
            sys.stdout.flush()
            continue

        try:
            year = doc['date'][:4]
            wc = int(doc['wordCount'])
            if year in count:
                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass
Example #2
0
class TroveSwiftIndexBuilder(TroveIndexBuilder):
    """Build an index for documents stored in a Swift object store"""


    def __init__(self, datafile, out='index.idx'):
        """Create a Trove Index containing offsets of each document write it to a file.

        datafile - the name of the source data file, can be gzipped or plain text
        outdir - output directory, default 'index'
        """

        self.swifttext = SwiftTextContainer()
        self.datafile = datafile

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)


    def add_to_index(self, id, offset, length):
        """Add this id/offset pair to the index
        """

        # for swift we use the file baseam
        self.out.write("%s, %d, %d, %s\n" % (id, offset, length, os.path.basename(self.datafile)))


    def _build_index(self):
        """Build an index of the documents in the datafile
        """

        for offset, line in self.swifttext.document_lines(self.datafile):

            try:
                data = json.loads(line.decode('utf-8'))
            except:
                print "Bad line: ", offset, line
                continue

            if 'id' in data:
                id = data['id']
                self.add_to_index(id, offset, len(line))
            else:
                print "Bad line: ", line