Python SwiftTextContainer Examples

Programming Language: Python

Namespace/Package Name: swifttext

Examples at hotexamples.com: 7

Python SwiftTextContainer - 7 examples found. These are the top rated real world Python examples of swifttext.SwiftTextContainer extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

document_lines(2)

documents(2)

get_by_offset(1)

Example #1

Show file

File: index.py Project: stevecassidy/trovenames

class TroveSwiftIndex(TroveIndex):
    """A Trove index whose documents are read from a Swift container.

    Index entries are looked up through the inherited ``get`` method; the
    document text itself is fetched from Swift by offset and length.
    (The original author describes the index as an in-memory dictionary;
    that storage is provided by the base class, not by this subclass.)
    """

    def __init__(self):
        # Create the container handle before the base initialiser runs, so
        # the attribute already exists if the base class uses it in setup.
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()

    def get_document(self, id):
        """Return the decoded JSON document stored under the given id.

        id -- the document identifier to look up in the index

        Returns the decoded JSON value, or None when the index lookup
        fails or the text fetched from Swift is not valid JSON.
        """

        try:
            offset, length, datafile = self.get(id)
        except Exception:
            # The lookup failed or did not yield an (offset, length,
            # datafile) triple.  Exception rather than a bare except so
            # that KeyboardInterrupt and SystemExit still propagate.
            return None

        # local indexer has stored full path to the file, truncate for swift
        datafile = os.path.basename(datafile)
        line = self.swifttext.get_by_offset(datafile, offset, length)

        try:
            return json.loads(line)
        except (TypeError, ValueError):
            # TypeError: nothing decodable came back (e.g. None);
            # ValueError: the text is not valid JSON.
            return None

Example #2

Show file

File: wordcount.py Project: stevecassidy/trovenames

def countwords(document):

    print "COUNT", document


    docname = document['name']

    outfile = "wordcount-" + docname

    sw = SwiftTextContainer()

    count = dict()
    wordcount = dict()

    n = 0
    for offset,line in sw.document_lines(docname):

        try:
            doc = json.loads(line.decode('utf-8'))
        except:
            sys.stdout.write('!*!')
            sys.stdout.flush()
            continue

        try:
            year = doc['date'][:4]
            wc = int(doc['wordCount'])
            if year in count:
                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass

Example #3

Show file

File: index.py Project: stevecassidy/trovenames

    def __init__(self, datafile, out='index.idx'):
        """Set up an index builder that reads its documents from Swift.

        datafile - the name of the source data file to index
        out - forwarded unchanged to the base class initialiser, default
              'index.idx' (presumably the index file to write - confirm
              against the base class)
        """

        # Both attributes are set before the base initialiser is invoked.
        self.datafile = datafile
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)

Example #4

Show file

File: index.py Project: stevecassidy/trovenames

class TroveSwiftIndexBuilder(TroveIndexBuilder):
    """Build an index for documents stored in a Swift object store"""


    def __init__(self, datafile, out='index.idx'):
        """Create a Trove Index containing offsets of each document write it to a file.

        datafile - the name of the source data file, can be gzipped or plain text
        outdir - output directory, default 'index'
        """

        self.swifttext = SwiftTextContainer()
        self.datafile = datafile

        super(TroveSwiftIndexBuilder, self).__init__(datafile, out)


    def add_to_index(self, id, offset, length):
        """Add this id/offset pair to the index
        """

        # for swift we use the file baseam
        self.out.write("%s, %d, %d, %s\n" % (id, offset, length, os.path.basename(self.datafile)))


    def _build_index(self):
        """Build an index of the documents in the datafile
        """

        for offset, line in self.swifttext.document_lines(self.datafile):

            try:
                data = json.loads(line.decode('utf-8'))
            except:
                print "Bad line: ", offset, line
                continue

            if 'id' in data:
                id = data['id']
                self.add_to_index(id, offset, len(line))
            else:
                print "Bad line: ", line

Example #5

Show file

File: index.py Project: stevecassidy/trovenames

    def __init__(self):
        """Create the Swift container handle, then run the base initialiser."""

        # Assigned before the base initialiser is called; keep this order in
        # case the base class touches the attribute during its own setup.
        self.swifttext = SwiftTextContainer()

        super(TroveSwiftIndex, self).__init__()

Example #6

Show file

File: index.py Project: stevecassidy/trovenames

    import optparse
    import sys

    parser = optparse.OptionParser()
    parser.add_option("-s", "--swift", dest="swift", action="store_true", default=False,
                      help="read data from a swift container")
    parser.add_option("-o", "--outdir", dest="outdir", action="store", default='index',
                      help="output directory for index files")

    (options, args) = parser.parse_args()


    if not os.path.exists(options.outdir):
        os.makedirs(options.outdir)

    if options.swift:
        container = SwiftTextContainer()

        for doc in container.documents():
            print doc
            base, ext = os.path.splitext(doc['name'])
            out = os.path.join(options.outdir, base + ".idx")
            TroveSwiftIndexBuilder(doc['name'], out=out)
    else:
        for doc in args:
            print doc
            base, ext = os.path.splitext(os.path.basename(doc))
            out = os.path.join(options.outdir, base + ".idx")
            TroveIndexBuilder(doc, out=out)

Example #7

Show file

File: wordcount.py Project: stevecassidy/trovenames

                count[year] += 1
            else:
                count[year] = 0

            if year in wordcount:
                wordcount[year] += wc
            else:
                wordcount[year] = wc

            n += 1

            if n % INTERVAL == 0:
                write(count, wordcount, outfile)
                sys.stdout.write("%s|" % n)
                sys.stdout.flush()
        except:
            pass


if __name__=='__main__':

    # Settings come from the 'default' section of the project configuration.
    config = readconfig()
    # INTERVAL is a module-level name read by countwords to decide how often
    # the running totals are written out.
    INTERVAL = int(config.get('default', 'WC_INTERVAL'))
    processes = int(config.get('default', 'PROCESSES'))

    sw = SwiftTextContainer()

    pool = Pool(processes)

    try:
        # One countwords call per document listed by the container; map()
        # blocks until every call has returned.
        pool.map(countwords, sw.documents())
    finally:
        # Shut the workers down explicitly instead of leaving it to
        # interpreter exit.  All results are in once map() returns, so
        # terminate() cannot cut a task short on the success path.
        pool.terminate()
        pool.join()