Beispiel #1
0
def cache_simstring(datasets,
                    verbose=False,
                    ann_modulo=1000,
                    queries_modulo=1000):
    """Run every unique annotation text through each SimString database.

    The annotation texts of all datasets are first collected into a set so
    that each distinct text is looked up only once per database; each one is
    then handed to `query_simstring_db` for every path in
    `SIMSTRING_DB_PATHS`, sharing a single reader per database.

    Arguments:
    datasets -- iterable of datasets; each dataset iterates documents, each
        document iterates sentences and each sentence iterates annotations.
        Sentences must provide `annotation_text(annotation)`.
    verbose -- when true, write progress information to `stderr`.
    ann_modulo -- with `verbose`, report after every this many look-ups.
    queries_modulo -- with `verbose`, report after every this many
        annotations seen while collecting.
    """
    from simstring import reader as simstring_reader

    if verbose:
        print >> stderr, 'Caching SimString:'

        print >> stderr, 'Pre-caching queries...',

    # Flatten dataset -> document -> sentence -> annotation into one lazy
    # stream of annotation texts
    annotation_texts = (
        sentence.annotation_text(annotation)
        for dataset in datasets
        for document in dataset
        for sentence in document
        for annotation in sentence)

    # Collecting into a set up front guarantees that every query is unique
    # by the time we hit the SimString databases, which in most cases beats
    # iterating over the raw annotations once per database
    queries = set()
    queries_seen = 0
    for text in annotation_texts:
        queries.add(text)
        if not verbose:
            continue
        queries_seen += 1
        if queries_seen % queries_modulo == 0:
            print >> stderr, queries_seen, '...',
    if verbose:
        print >> stderr, ('Done! (reduced from {} to {})').format(
            queries_seen, len(queries))

    for db_i, db_path in enumerate(SIMSTRING_DB_PATHS, start=1):
        if verbose:
            print >> stderr, 'Caching for db: {0} ({1}/{2}) ...'.format(
                db_path, db_i, len(SIMSTRING_DB_PATHS)),

        db_reader = None
        try:
            # One reader per database, re-used for all of its queries
            db_reader = simstring_reader(db_path)
            for ann_cnt, query in enumerate(queries, start=1):
                query_simstring_db(query, db_path, reader_arg=db_reader)

                if verbose and ann_cnt % ann_modulo == 0:
                    print >> stderr, ann_cnt, '...',

        finally:
            # Release the reader even if a look-up blew up
            if db_reader is not None:
                db_reader.close()
        if verbose:
            print >> stderr, 'Done!'
Beispiel #2
0
def _token_reprs(tokens, db_paths, verbose=False):
    """Generate ``(token, representation)`` pairs from SimString look-ups.

    For every database in `db_paths` each token is passed, UTF-8 encoded, to
    `_find_threshold` together with a cosine-measure reader for that
    database; the results are collected per token in database order.

    Arguments:
    tokens -- re-iterable collection of tokens; it is walked once per
        database and once more when yielding, so a one-shot iterator will
        not work. Each token must support ``.encode('utf-8')`` and be
        hashable.
    db_paths -- SimString database paths (``len`` is taken of it when
        `verbose` is set).
    verbose -- report progress and a completion estimate via `_vprint`.

    Yields:
    ``(token, results)`` for each token in `tokens`, where `results` is the
    list of `_find_threshold` return values gathered for that token.

    Nothing is queried until the generator is first advanced; at that point
    all databases are processed before the first pair is yielded.
    """
    from simstring import cosine as simstring_cosine
    from simstring import reader as simstring_reader

    if verbose:
        db_timings = []

    repr_by_token = defaultdict(list)
    for db_path_i, db_path in enumerate(db_paths, start=1):
        if verbose:
            db_timing_start = datetime.utcnow()
            _vprint('Opening DB ({}/{}): {}'.format(db_path_i, len(db_paths),
                db_path))

        reader = simstring_reader(db_path)
        try:
            reader.measure = simstring_cosine

            if verbose:
                _vprint('Querying DB...', end='')

            # Pre-bind the counter so the summary below still works when
            # there are no tokens at all
            token_i = 0
            for token_i, token in enumerate(tokens, start=1):
                if verbose and token_i % 1000 == 0:
                    _vprint('{}...'.format(token_i), no_tag=True, end='')

                # Fetch the threshold to use as a distance
                repr_by_token[token].append(
                    _find_threshold(token.encode('utf-8'), reader))
        finally:
            # Always release the database handle, also if a look-up fails
            reader.close()

        if verbose:
            _vprint('{}...Done!'.format(token_i), no_tag=True)

            db_timings.append(datetime.utcnow() - db_timing_start)
            if db_path_i != len(db_paths):
                # Extrapolate from the mean time spent per database so far
                per_db_estimate = (sum(db_timings, timedelta())
                        / len(db_timings))
                completion_estimate = (per_db_estimate
                        * (len(db_paths) - db_path_i))
                _vprint('Estimated time until completion: {}'.format(
                    completion_estimate))

    for token in tokens:
        yield token, repr_by_token[token]
Beispiel #3
0
def query_simstring_db(query, db_path, reader_arg=None):
    """Look up `query` in the SimString database at `db_path`, with caching.

    Results are memoised per database in `SIMSTRING_QUERY_CACHE` (loaded via
    `_load_simstring_cache` on first use). On a cache miss the database is
    queried using the cosine measure with thresholds from 1.0 downwards in
    steps of 0.1 until `QUERY_CUT_OFF` / 10.0 (inclusive), stopping at the
    first threshold that yields any response.
    `MODIFIED_SIMSTRING_QUERY_CACHE` is set whenever the cache had no entry
    for the database or the query.

    Arguments:
    query -- the string to look up; it is encoded to UTF-8 for the reader.
    db_path -- path of the SimString database, also used as the cache key.
    reader_arg -- optional, already opened reader to use. It is not closed
        by this function, but its `measure` and `threshold` attributes are
        overwritten. Without it a reader is opened and closed internally.

    Returns:
    A tuple ``(threshold, tsuruoka_dist)``. `threshold` is the first
    threshold that gave a response. `tsuruoka_dist` is None unless
    `TSURUOKA_DIST` is set, in which case it is the maximum
    `bucket_norm_tsuruoka` (if `TSURUOKA_NORMALISED`) or the minimum
    `bucket_tsuruoka` over the top `RESPONSE_CUT_OFF` responses.
    ``(None, None)`` is returned if no threshold gave a response.
    """
    from simstring import reader as simstring_reader
    from simstring import cosine as simstring_cosine

    global SIMSTRING_QUERY_CACHE
    global MODIFIED_SIMSTRING_QUERY_CACHE
    if SIMSTRING_QUERY_CACHE is None:
        _load_simstring_cache()

    try:
        cache = SIMSTRING_QUERY_CACHE[db_path]
    except KeyError:
        cache = {}
        SIMSTRING_QUERY_CACHE[db_path] = cache
        MODIFIED_SIMSTRING_QUERY_CACHE = True

    try:
        return cache[query]
    except KeyError:
        MODIFIED_SIMSTRING_QUERY_CACHE = True

    # We have to query this...

    # The reader will choke on unicode objects, so hand it UTF-8 bytes. The
    # encoded form is the same for every threshold, so build it only once.
    query_utf8 = query.encode('utf-8')

    # Borrow the reader of the caller if there is one, otherwise open (and
    # afterwards close) our own
    reader = reader_arg
    try:
        if reader is None:
            reader = simstring_reader(db_path)

        reader.measure = simstring_cosine
        # Begin with the strictest threshold and relax it step by step
        for threshold in (v / 10.0 for v in xrange(10, QUERY_CUT_OFF - 1, -1)):
            reader.threshold = threshold

            response = reader.retrieve(query_utf8)
            if not response:
                # Nothing at this threshold, try the next (looser) one
                continue

            if not TSURUOKA_DIST:
                # Only the fact that we got a response matters
                tsuruoka_dist = None
            else:
                # Okay, now we are in a pickle, SimString has returned
                # everything sorted by length... Although it had it
                # internally by n-gram. *sigh* We need it by n-gram.

                # Sort the response to prepare a cut-off
                from lib.ngram import n_gram_ref_cos_cmp, n_gram_gen
                ref_grams = set(g for g in n_gram_gen(query, n=3,
                    guards=TSURUOKA_GUARDED))

                # We need Unicode internally at this point
                candidates = [s.decode('utf-8') for s in response]
                candidates = sorted(candidates,
                        cmp=lambda a, b: -n_gram_ref_cos_cmp(
                            a, b, ref_grams, guards=TSURUOKA_GUARDED))
                # Cut-off time!
                candidates = candidates[:RESPONSE_CUT_OFF]

                if TSURUOKA_NORMALISED:
                    tsuruoka_dist = max(bucket_norm_tsuruoka(query, resp_str)
                            for resp_str in candidates)
                else:
                    tsuruoka_dist = min(bucket_tsuruoka(query, resp_str)
                            for resp_str in candidates)

            cache[query] = (threshold, tsuruoka_dist)
            # We can and should bail at this point
            break
        else:
            # We found no results for any threshold
            cache[query] = (None, None)
    finally:
        # Only close if we were not passed the reader
        if reader_arg is None and reader is not None:
            reader.close()

    return cache[query]
Beispiel #4
0
def query_simstring_db(query, db_path, reader_arg=None):
    """Look up `query` in the SimString database at `db_path`, with caching.

    Results are memoised per database in `SIMSTRING_QUERY_CACHE` (loaded via
    `_load_simstring_cache` on first use). On a cache miss the database is
    queried using the cosine measure with thresholds from 1.0 downwards in
    steps of 0.1 until `QUERY_CUT_OFF` / 10.0 (inclusive), stopping at the
    first threshold that yields any response.
    `MODIFIED_SIMSTRING_QUERY_CACHE` is set whenever the cache had no entry
    for the database or the query.

    Arguments:
    query -- the string to look up; it is encoded to UTF-8 for the reader.
    db_path -- path of the SimString database, also used as the cache key.
    reader_arg -- optional, already opened reader to use. It is not closed
        by this function, but its `measure` and `threshold` attributes are
        overwritten. Without it a reader is opened and closed internally.

    Returns:
    A tuple ``(threshold, tsuruoka_dist)``. `threshold` is the first
    threshold that gave a response. `tsuruoka_dist` is None unless
    `TSURUOKA_DIST` is set, in which case it is the maximum
    `bucket_norm_tsuruoka` (if `TSURUOKA_NORMALISED`) or the minimum
    `bucket_tsuruoka` over the top `RESPONSE_CUT_OFF` responses.
    ``(None, None)`` is returned if no threshold gave a response.
    """
    from simstring import reader as simstring_reader
    from simstring import cosine as simstring_cosine

    global SIMSTRING_QUERY_CACHE
    global MODIFIED_SIMSTRING_QUERY_CACHE
    if SIMSTRING_QUERY_CACHE is None:
        _load_simstring_cache()

    try:
        cache = SIMSTRING_QUERY_CACHE[db_path]
    except KeyError:
        cache = {}
        SIMSTRING_QUERY_CACHE[db_path] = cache
        MODIFIED_SIMSTRING_QUERY_CACHE = True

    try:
        return cache[query]
    except KeyError:
        MODIFIED_SIMSTRING_QUERY_CACHE = True

    # We have to query this...

    # The reader will choke on unicode objects, so hand it UTF-8 bytes. The
    # encoded form is the same for every threshold, so build it only once.
    query_utf8 = query.encode('utf-8')

    # Borrow the reader of the caller if there is one, otherwise open (and
    # afterwards close) our own
    reader = reader_arg
    try:
        if reader is None:
            reader = simstring_reader(db_path)

        reader.measure = simstring_cosine
        # Begin with the strictest threshold and relax it step by step
        for threshold in (v / 10.0 for v in xrange(10, QUERY_CUT_OFF - 1, -1)):
            reader.threshold = threshold

            response = reader.retrieve(query_utf8)
            if not response:
                # Nothing at this threshold, try the next (looser) one
                continue

            if not TSURUOKA_DIST:
                # Only the fact that we got a response matters
                tsuruoka_dist = None
            else:
                # Okay, now we are in a pickle, SimString has returned
                # everything sorted by length... Although it had it
                # internally by n-gram. *sigh* We need it by n-gram.

                # Sort the response to prepare a cut-off
                from lib.ngram import n_gram_ref_cos_cmp, n_gram_gen
                ref_grams = set(g for g in n_gram_gen(
                    query, n=3, guards=TSURUOKA_GUARDED))

                # We need Unicode internally at this point
                candidates = [s.decode('utf-8') for s in response]
                candidates = sorted(
                    candidates,
                    cmp=lambda a, b: -n_gram_ref_cos_cmp(
                        a, b, ref_grams, guards=TSURUOKA_GUARDED))
                # Cut-off time!
                candidates = candidates[:RESPONSE_CUT_OFF]

                if TSURUOKA_NORMALISED:
                    tsuruoka_dist = max(
                        bucket_norm_tsuruoka(query, resp_str)
                        for resp_str in candidates)
                else:
                    tsuruoka_dist = min(
                        bucket_tsuruoka(query, resp_str)
                        for resp_str in candidates)

            cache[query] = (threshold, tsuruoka_dist)
            # We can and should bail at this point
            break
        else:
            # We found no results for any threshold
            cache[query] = (None, None)
    finally:
        # Only close if we were not passed the reader
        if reader_arg is None and reader is not None:
            reader.close()

    return cache[query]