Code example #1
File: multiproc.py  Project: oier/Yaki
    def finish(self, termswriter, doccount, lengthfile):
        _fieldlength_totals = self._fieldlength_totals
        if not self.tasks:
            return

        jobqueue = self.jobqueue
        rqueue = self.resultqueue

        # A (None, doccount) job is the sentinel that tells a worker to
        # finish up; send one per task
        for _ in self.tasks:
            jobqueue.put((None, doccount))

        for task in self.tasks:
            task.join()

        runs = []
        lenfilenames = []
        # Collect each task's sorted runs, per-field length totals/maxes,
        # and the name of its length file
        for task in self.tasks:
            taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
            runs.extend(taskruns)
            lenfilenames.append(lenfilename)
            for fieldnum, total in flentotals.iteritems():
                _fieldlength_totals[fieldnum] += total
            for fieldnum, length in flenmaxes.iteritems():
                if length > self._fieldlength_maxes.get(fieldnum, 0):
                    self._fieldlength_maxes[fieldnum] = length

        jobqueue.close()
        rqueue.close()

        lw = LengthWriter(lengthfile, doccount)
        for lenfilename in lenfilenames:
            sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                      doccount)
            lw.add_all(sublengths)
            os.remove(lenfilename)
        lw.close()
        lengths = lw.reader()

        # Disabled code: pre-merge batches of runs in parallel with a
        # process pool before the final merge
#        if len(runs) >= self.procs * 2:
#            pool = Pool(self.procs)
#            tempname = lambda: tempfile.mktemp(suffix=".run", dir=self.dir)
#            while len(runs) >= self.procs * 2:
#                runs2 = [(runs[i:i+4], tempname())
#                         for i in xrange(0, len(runs), 4)]
#                if len(runs) % 4:
#                    last = runs2.pop()[0]
#                    runs2[-1][0].extend(last)
#                runs = pool.map(merge_runs, runs2)
#            pool.close()

        iterator = imerge(
            [read_run(runname, count) for runname, count in runs])
        total = sum(count for runname, count in runs)
        termswriter.add_iter(iterator, lengths.get)
        for runname, count in runs:
            os.remove(runname)

        self.cleanup()
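
A note on the shutdown handshake above: putting one (None, doccount) job per worker task and then join()-ing each task is the standard sentinel ("poison pill") pattern for multiprocessing queues. A minimal standalone sketch of just that pattern, using Python's stdlib multiprocessing (the worker and the job shape are hypothetical, not the pool's actual protocol):

from multiprocessing import Process, Queue

def worker(jobs):
    # Consume jobs until the sentinel arrives, then exit cleanly
    while True:
        job = jobs.get()
        if job is None:  # sentinel: no more work
            break
        # ... process the job, e.g. index a batch of documents ...

if __name__ == "__main__":
    jobs = Queue()
    tasks = [Process(target=worker, args=(jobs,)) for _ in range(4)]
    for t in tasks:
        t.start()
    # ... enqueue real jobs here ...
    for _ in tasks:
        jobs.put(None)  # one sentinel per worker
    for t in tasks:
        t.join()
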
Code example #2
    def finish(self, termswriter, doccount, lengthfile):
        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        if not self._flushed:
            # Nothing has been spilled to disk yet: read the postings
            # straight from the in-memory buffer
            gen = self.readback_buffer()
        else:
            # Flush any remaining buffered postings, then read everything
            # back from disk
            if self.postbuf:
                self.flush()
            gen = self.readback()

        termswriter.add_iter(gen, lengths.get)
Code example #3
    def finish(self, doccount, lengthfile, termtable, postingwriter):
        _fieldlength_totals = self._fieldlength_totals
        if not self.tasks:
            return

        pqueue = self.postingqueue
        rqueue = self.resultsqueue

        for _ in xrange(self.procs):
            pqueue.put((-1, doccount))

        #print "Joining..."
        t = now()
        for task in self.tasks:
            task.join()
        #print "Join:", now() - t

        #print "Getting results..."
        t = now()
        runs = []
        lenfilenames = []
        for task in self.tasks:
            taskruns, flentotals, flenmaxes, lenfilename = rqueue.get()
            runs.extend(taskruns)
            lenfilenames.append(lenfilename)
            for fieldnum, total in flentotals.iteritems():
                _fieldlength_totals[fieldnum] += total
            for fieldnum, length in flenmaxes.iteritems():
                if length > self._fieldlength_maxes.get(fieldnum, 0):
                    self._fieldlength_maxes[fieldnum] = length
        #print "Results:", now() - t

        #print "Writing lengths..."
        t = now()
        lw = LengthWriter(lengthfile, doccount)
        for lenfilename in lenfilenames:
            sublengths = LengthReader(StructFile(open(lenfilename, "rb")),
                                      doccount)
            lw.add_all(sublengths)
            os.remove(lenfilename)
        lw.close()
        lengths = lw.reader()
        #print "Lengths:", now() - t

        t = now()
        iterator = imerge([read_run(runname, count) for runname, count in runs])
        total = sum(count for runname, count in runs)
        write_postings(self.schema, termtable, lengths, postingwriter, iterator)
        for runname, count in runs:
            os.remove(runname)
        #print "Merge:", now() - t

        self.cleanup()
Code example #4
    def __init__(self, storage, schema, segment):
        self.storage = storage
        self.schema = schema
        self.segment = segment

        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term vector index, and vector postings: lazy load
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy info from underlying segment
        self._has_deletions = segment.has_deletions()
        self._doc_count = segment.doc_count()

        # Postings file
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        # Dawg file
        self.dawg = None
        if any(field.spelling for field in self.schema):
            fname = segment.dawg_filename
            if self.storage.file_exists(fname):
                dawgfile = self.storage.open_file(fname, mapped=False)
                self.dawg = DiskNode.load(dawgfile, expand=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
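
The reader above initializes vectorindex and vpostfile to None so that the term-vector files are only opened on first use. A generic sketch of that lazy-open idiom using a property (the class, names, and open_file signature are placeholders, not this reader's actual API):

class LazyVectors(object):
    def __init__(self, storage, filename):
        self.storage = storage
        self.filename = filename
        self._vpostfile = None  # opened lazily on first access

    @property
    def vpostfile(self):
        # Open the vector postings file the first time it is requested
        if self._vpostfile is None:
            self._vpostfile = self.storage.open_file(self.filename)
        return self._vpostfile
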
Code example #5
    def finish(self, termswriter, doccount, lengthfile):
        from itertools import izip

        pbuf = self.postbuf
        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        def gen():
            # pbuf maps each (fieldname, text) term to three parallel
            # sequences: docnums, weights, and valuestrings
            for term in sorted(pbuf):
                fieldname, text = term
                for docnum, weight, valuestring in izip(*pbuf[term]):
                    yield (fieldname, text, docnum, weight, valuestring)

        termswriter.add_iter(gen(), lengths.get)
Code example #6
    def finish(self, termswriter, doccount, lengthfile):
        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        if self.postings or self.runs:
            if self.postings and len(self.runs) == 0:
                # Everything fit in memory: sort and iterate directly
                self.postings.sort()
                postiter = iter(self.postings)
            elif not self.postings and not self.runs:
                # No postings at all
                postiter = iter([])
            else:
                # Spill the remaining buffer to a run file, then k-way
                # merge all the sorted runs
                self.dump_run()
                postiter = imerge([read_run(runname, count)
                                   for runname, count in self.runs])

            termswriter.add_iter(postiter, lengths.get)
        self.cleanup()
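
The branching above is a small external sort: postings that fit in memory are sorted and iterated directly, while anything larger is spilled to sorted run files and k-way merged. A self-contained sketch of the spill-and-merge step, substituting heapq.merge and a made-up pickle run format for the imerge/read_run pair used above:

import heapq
import os
import pickle
import tempfile

def dump_run(postings):
    # Spill one sorted batch of postings to a temporary run file
    postings.sort()
    fd, name = tempfile.mkstemp(suffix=".run")
    with os.fdopen(fd, "wb") as f:
        pickle.dump(postings, f)
    return name

def iter_run(name):
    # Yield a run file's postings in sorted order, then delete the file
    with open(name, "rb") as f:
        postings = pickle.load(f)
    os.remove(name)
    for post in postings:
        yield post

runs = [dump_run([(2, "b"), (1, "a")]), dump_run([(3, "c"), (0, "z")])]
merged = heapq.merge(*(iter_run(name) for name in runs))
print(list(merged))  # [(0, 'z'), (1, 'a'), (2, 'b'), (3, 'c')]
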
Code example #7
File: filereading.py  Project: bopopescu/mp100
    def __init__(self, storage, schema, segment):
        self.storage = storage
        self.schema = schema
        self.segment = segment

        if hasattr(self.segment, "uuid"):
            self.uuid_string = str(self.segment.uuid)
        else:
            import uuid
            self.uuid_string = str(uuid.uuid4())

        # Term index
        tf = storage.open_file(segment.termsindex_filename)
        self.termsindex = TermIndexReader(tf)

        # Term postings file, vector index, and vector postings: lazy load
        self.postfile = None
        self.vectorindex = None
        self.vpostfile = None

        # Stored fields file
        sf = storage.open_file(segment.storedfields_filename, mapped=False)
        self.storedfields = StoredFieldReader(sf)

        # Field length file
        self.fieldlengths = None
        if self.schema.has_scorable_fields():
            flf = storage.open_file(segment.fieldlengths_filename)
            self.fieldlengths = LengthReader(flf, segment.doc_count_all())

        # Copy methods from underlying segment
        self.has_deletions = segment.has_deletions
        self.is_deleted = segment.is_deleted
        self.doc_count = segment.doc_count

        # Postings file (opened eagerly here; this overwrites the lazy-load
        # None assigned above)
        self.postfile = self.storage.open_file(segment.termposts_filename,
                                               mapped=False)

        self.dc = segment.doc_count_all()
        assert self.dc == self.storedfields.length

        self.set_caching_policy()

        self.is_closed = False
        self._sync_lock = Lock()
Code example #8
File: pools.py  Project: dongshige/wikidpad
    def finish(self, doccount, lengthfile, termtable, postingwriter):
        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)

        if self.postings or self.runs:
            if self.postings and len(self.runs) == 0:
                self.postings.sort()
                postiter = iter(self.postings)
            elif not self.postings and not self.runs:
                postiter = iter([])
            else:
                self.dump_run()
                postiter = imerge(
                    [read_run(runname, count) for runname, count in self.runs])

            write_postings(self.schema, termtable, lengths, postingwriter,
                           postiter)
        self.cleanup()
Code example #9
    def finish(self, termswriter, doccount, lengthfile):
        self._write_lengths(lengthfile, doccount)
        lengths = LengthReader(None, doccount, self.length_arrays)
        self.postbuf.sort()
        termswriter.add_iter(self.postbuf, lengths.get)