LODStats is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with LODStats. If not, see <http://www.gnu.org/licenses/>. """ import bitarray from hashlib import md5 from LimitedSizeDict import LimitedSizeDict # FIXME: does it help to build some small "md5-cache" for the last 1, 2, 3 strings?! # subjects distinct_subjects = LimitedSizeDict(size_limit=300000) # FIXME: make limit configurable # 0: entities, 1: typed subjects, 2: labeled subjects def query_distinct_subject(s, num_id): if len(s) > 16: s_hash = md5(s).digest() else: s_hash = s if distinct_subjects.has_key(s_hash): return distinct_subjects[s_hash][num_id] else: return False def set_distinct_subject(s, num_id): if len(s) > 16: s_hash = md5(s).digest()