Ejemplo n.º 1
0
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH, include_marks=DEFAULT_INCLUDE_MARKS):
    """
    Create a simstring database file populated with the given strings.

    Arguments:
        strs: iterable of strings; each one is inserted into the DB.
        dbname: DB name, converted to a file path by __ssdb_path.
        ngram_length: n-gram length; only 3 is accepted.
        include_marks: begin/end marks flag; only False is accepted.

    Returns the path of the database file that was written.

    Raises NoSimStringError (after reporting through Messager.error)
    when the simstring module cannot be imported. Any failure while
    checking the options or writing the DB is announced on stderr and
    then re-raised unchanged.
    """
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    db_path = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        writer = simstring.writer(db_path)
        for entry in strs:
            writer.insert(entry)
        writer.close()
    except:
        # Report, then let the original exception propagate to the caller.
        print >> sys.stderr, "Error building simstring DB"
        raise

    return db_path
Ejemplo n.º 2
0
def write_simstring(dkt):
    """
    Write every item of ``dkt`` into the simstring DB 'simstring_law/law.db'.

    Iterating ``dkt`` yields the strings to store (its keys, if it is a
    dict). ``unicode`` items are encoded as UTF-8 before insertion. The
    writer is opened with the arguments (3, False, True).

    The writer is closed once all items are inserted so that the database
    is finalized explicitly rather than left to garbage collection.
    """
    dbpath='simstring_law/law.db'
    db=simstring.writer(dbpath,3,False,True)
    for k in dkt:
        if isinstance(k,unicode):
            k=k.encode('utf-8')
        db.insert(k)
    # Explicitly finish the database; the original never closed the writer.
    db.close()
Ejemplo n.º 3
0
def ssdb_build(strs,
               dbname,
               ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Write the strings in strs into a new simstring DB and return the
    path of the resulting file (derived from dbname via __ssdb_path).

    Only ngram_length == 3 and include_marks == False are accepted; any
    other value fails the corresponding assertion. If simstring is not
    importable, the problem is reported through Messager.error and
    NoSimStringError is raised. Errors during the build are announced on
    stderr and re-raised.
    '''
    try:
        import simstring
    except ImportError:
        Messager.error(SIMSTRING_MISSING_ERROR, duration=-1)
        raise NoSimStringError

    target_fn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        ssdb = simstring.writer(target_fn)
        for string in strs:
            ssdb.insert(string)
        ssdb.close()
    except:
        # Announce the failure but keep the original exception intact.
        print >> sys.stderr, "Error building simstring DB"
        raise

    return target_fn
    def __init__(self,
                 dbfn,
                 ngram_length=SimstringBase.DEFAULT_NGRAM_LENGTH,
                 include_marks=SimstringBase.DEFAULT_INCLUDE_MARKS,
                 threshold=SimstringBase.DEFAULT_THRESHOLD,
                 similarity_measure=SimstringBase.DEFAULT_SIMILARITY_MEASURE,
                 unicode=SimstringBase.DEFAULT_UNICODE,
                 build=False):
        """Open the simstring DB at ``dbfn`` for writing or for reading.

        All arguments are forwarded to the base-class initializer. When
        ``build`` is true a ``simstring.writer`` is opened on ``self.dbfn``,
        otherwise a ``simstring.reader``. The chosen similarity measure
        (looked up in SIMILARITY_MEASURES) and ``threshold`` are then set
        on the opened handle.

        Only ``include_marks == False`` and ``ngram_length == 3`` are
        accepted; other values fail an assertion before anything is opened.
        """
        assert include_marks == False, "Error: begin/end marks not supported"
        assert ngram_length == 3, "Error: unsupported n-gram length"

        super().__init__(
            dbfn,
            ngram_length=ngram_length,
            include_marks=include_marks,
            threshold=threshold,
            similarity_measure=similarity_measure,
            unicode=unicode,
            build=build,
        )

        # Build mode needs a writer handle; lookup mode needs a reader.
        open_db = simstring.writer if build else simstring.reader
        self.db = open_db(self.dbfn)

        self.db.measure = SIMILARITY_MEASURES[similarity_measure]
        self.db.threshold = threshold
Ejemplo n.º 5
0
def ssdb_build(strs,
               dbname,
               ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    """Build a simstring DB containing the given strings.

    Args:
        strs: iterable of strings to store.
        dbname: DB name, converted to a file path by __ssdb_path.
        ngram_length: n-gram length; must be 3 when SIMSTRING_BINARY is set.
        include_marks: begin/end marks flag; must be False.

    Returns:
        The path of the database that was written.

    Raises:
        AssertionError: for unsupported option values.
        Any exception raised while building the DB; it is announced on
        stderr and then re-raised unchanged.
    """
    __import_simstring()
    dbfn = __ssdb_path(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert include_marks == False, "Error: begin/end marks not supported"
        if SIMSTRING_BINARY:
            assert ngram_length == 3, "Error: unsupported n-gram length"
            db = simstring.writer(dbfn)
            for s in strs:
                db.insert(s)
            db.close()
        else:
            # NOTE(review): this branch always extracts features with
            # DEFAULT_NGRAM_LENGTH and never consults ngram_length.
            fx = CharacterNgramFeatureExtractor(DEFAULT_NGRAM_LENGTH)
            db = SQLite3Database(fx)
            db.use(dbfn)
            for s in strs:
                db.add(s)

    except BaseException:
        # sys.stderr.write works on both Python 2 and 3. The former
        # `print >> sys.stderr, ...` statement raises TypeError on Python 3,
        # which would replace the original exception before the re-raise.
        sys.stderr.write("Error building simstring DB\n")
        raise

    return dbfn
Ejemplo n.º 6
0
 def __init__(self, words, measure=3, n=3, be=True, unicode=True, file="sample.db"):
     """Build a simstring DB from ``words`` under ./db and reopen it for reading.

     Args:
         words: iterable of strings inserted into the database.
         measure: value assigned to the ``measure`` attribute of both the
             writer and the reader.
         n: stored on ``self.n`` and passed to ``simstring.writer``.
         be: passed to ``simstring.writer`` (presumably the begin/end
             marks flag -- confirm against the simstring docs).
         unicode: passed to ``simstring.writer``.
         file: file name of the database inside the ``db`` directory.

     After construction ``self.db`` is a ``simstring.reader`` on the
     freshly written database.
     """
     # Local import so this block does not depend on a module-level import.
     import os

     self.n = n
     # Create ./db without spawning a shell; equivalent to `mkdir -p db`
     # but also works where no POSIX shell/mkdir is available.
     os.makedirs("db", exist_ok=True)
     db = simstring.writer(f'./db/{file}', n, be, unicode)
     db.measure = measure
     for w in words:
         db.insert(w)
     db.close()
     # Reopen the finished database in read mode for lookups.
     db = simstring.reader(f"./db/{file}")
     db.measure = measure
     self.db = db
Ejemplo n.º 7
0
def load_data(csv_file, db_file):
    db = simstring.writer(db_file, 3, False, True)
    with open(csv_file, "rb") as f:
        csv_reader = csv.reader(f, delimiter=',')
        for row in csv_reader:
            word = row[0].lower()
            str_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
            try:
                db.insert(str_word);
            except UnicodeEncodeError as e:
                print word
                pass
    db.close()
Ejemplo n.º 8
0
    def __init__(self, path):
        """Open a simstring writer for 'umls-terms.simstring' inside ``path``.

        Args:
            path: an existing directory that will hold the database file.

        Raises:
            IOError: if ``path`` does not exist or is not a directory.
        """
        # os.path.isdir is False for a missing path, so one check covers both
        # "does not exist" and "is not a directory". The former
        # `else: os.makedirs(path)` branch only ran for an existing directory,
        # where makedirs always raised the OSError that was swallowed; it has
        # been dropped as dead code.
        if not os.path.isdir(path):
            err_msg = (
                '"{}" does not exists or it is not a directory.').format(path)
            raise IOError(err_msg)

        self.db = simstring.writer(
            prepare_string_for_db_input(
                os.path.join(path, 'umls-terms.simstring')), 3, False, True)
Ejemplo n.º 9
0
def create_dbs():
    """
    Reads in the files specified in the lists specified in fe_settings and 
    creates simstring databases.
    """
    for name, fnames in [('people', fe_settings.people),
                         ('places', fe_settings.places),
                         ('departments', fe_settings.departments),
                         ('universities', fe_settings.universities)]:
        out_dbname = os.path.join(fe_settings.simstringdb_dir, name + '.db')
        # Enable creating the database in unicode mode.
        group_db = simstring.writer(out_dbname, 3, False, True)
        for fname in fnames:
            fname = os.path.join(fe_settings.lexicon_dir, fname)
            with open(fname, 'r') as file:
                for line in file:
                    group_db.insert(line.strip())
        group_db.close()
        print 'Wrote: ', out_dbname
Ejemplo n.º 10
0
def build_simstring_db(strs, name):
    """
    Write the strings in strs into a new simstring database and return
    the path of the file that was created.

    The file lives in DB_BASE_DIRECTORY and is named
    "<name>.<pid>.db", so the returned path is based on, but not equal
    to, the given name. Any failure is announced on stderr and re-raised.
    """

    try:
        # include pid to assure that there are no clashes.
        file_name = name + "." + str(os.getpid()) + ".db"
        db_path = os.path.join(DB_BASE_DIRECTORY, file_name)
        writer = simstring.writer(db_path)
        for entry in strs:
            writer.insert(entry)
        writer.close()
    except:
        # Report, then propagate the original exception.
        print >> sys.stderr, "Error building simstring DB"
        raise

    return db_path
Ejemplo n.º 11
0
def create_simstring_databases():
    """
    Create one simstring database per dictionary file in ../Data/dicts.

    For every directory entry, the part of the file name before the first
    "." becomes the database name; entries whose name part is empty or
    whitespace (e.g. dot-files) are skipped. Each stripped line of the
    file is inserted into "<name>.db", which is created relative to the
    current working directory.

    :return: None
    """
    sep = get_dir_separator()
    ood_path = ".." + sep + "Data" + sep + "dicts"

    for dicts_file in os.listdir(ood_path):

        file_name = dicts_file.split(".")[0]

        if not file_name.strip():
            continue

        simstring_db = simstring.writer(file_name + '.db')

        # Use a context manager so the dictionary file is always closed;
        # the previous open(...).readlines() never closed its handle.
        with open(ood_path + sep + dicts_file, 'r') as dict_handle:
            for dict_word in dict_handle:
                simstring_db.insert(dict_word.strip())

        simstring_db.close()
Ejemplo n.º 12
0
 def __init__(self,
              directory,
              filename,
              measure=simstring.overlap,
              threshold=0.65,
              mode='write'):
     """Open a simstring database file for writing or reading.

     Args:
         directory: existing directory that contains (or will contain)
             the database file.
         filename: database file name; must end with '.db'.
         measure: similarity measure assigned to the reader; unused in
             write mode.
         threshold: similarity threshold assigned to the reader; unused
             in write mode.
         mode: 'write' opens a ``simstring.writer``; any other value
             opens a ``simstring.reader``.

     Raises:
         ValueError: if ``filename`` does not end with '.db' or
             ``directory`` is not an existing directory.

     Exactly one of ``self.writer`` / ``self.reader`` is set; the other
     stays None.
     """
     if not filename.endswith('.db'):
         raise ValueError(
             "Incorrect file format for Database. Database must end with .db"
         )
     # Reported separately: the suffix message above would be misleading
     # when the real problem is the directory.
     if not os.path.isdir(directory):
         raise ValueError(
             "Database directory does not exist or is not a directory: "
             "{}".format(directory)
         )

     db_path = os.path.join(directory, filename)
     self.writer = None
     self.reader = None
     if mode == 'write':
         self.writer = simstring.writer(db_path)
     else:
         self.reader = simstring.reader(db_path)
         self.reader.measure = measure
         self.reader.threshold = threshold
Ejemplo n.º 13
0
def ssdb_build(strs, dbname, ngram_length=DEFAULT_NGRAM_LENGTH,
               include_marks=DEFAULT_INCLUDE_MARKS):
    '''
    Store every string from strs in a new simstring DB whose file name
    is derived from dbname by __ssdb_filename, and return that file name.

    Only ngram_length == 3 and include_marks == False pass the option
    assertions. A failure at any point of the build is announced on
    stderr and then re-raised.
    '''
    __import_simstring()

    out_fn = __ssdb_filename(dbname)
    try:
        # only library defaults (n=3, no marks) supported just now (TODO)
        assert ngram_length == 3, "Error: unsupported n-gram length"
        assert include_marks == False, "Error: begin/end marks not supported"
        ss_writer = simstring.writer(out_fn)
        for item in strs:
            ss_writer.insert(item)
        ss_writer.close()
    except:
        # Tell the user, then hand the original exception to the caller.
        print >> sys.stderr, "Error building simstring DB"
        raise

    return out_fn
Ejemplo n.º 14
0
def main(argv):
    """Import a tab-separated entity file into an SQL DB and a simstring DB.

    Command-line arguments are parsed from ``argv[1:]`` via ``argparser()``;
    the options read here are ``file``, ``database``, ``verbose`` and
    ``encoding``.

    Each input line is expected to be ``ID<TAB>TYPE:LABEL:STRING[<TAB>...]``.
    For every well-formed line one row is inserted into ``entities``, new
    labels are added to ``labels``, and each triple is inserted into the
    table given by ``TABLE_FOR_TYPE``. After the indices are created and
    the transaction committed, the strings returned by
    ``SELECT_SIMSTRING_STRINGS_COMMAND`` are written (UTF-8 encoded) into
    the simstring DB.

    Returns:
        0 on success; 1 if connecting to the SQL DB, creating a table or
        creating an index fails. Errors while building the simstring DB
        are reported on stderr and re-raised.
    """
    arg = argparser().parse_args(argv[1:])

    # only simstring library default supported at the moment (TODO)
    assert DEFAULT_NGRAM_LENGTH == 3, "Error: unsupported n-gram length"
    assert DEFAULT_INCLUDE_MARKS == False, "Error: begin/end marks not supported"

    infn = arg.file

    if arg.database is None:
        # default database file name
        bn = splitext(basename(infn))[0]
        sqldbfn = sqldb_filename(bn)
        ssdbfn = ssdb_filename(bn)
    else:
        sqldbfn = arg.database + '.' + SQL_DB_FILENAME_EXTENSION
        ssdbfn = arg.database + '.' + SS_DB_FILENAME_EXTENSION

    if arg.verbose:
        print("Storing SQL DB as %s and" % sqldbfn, file=sys.stderr)
        print("  simstring DB as %s" % ssdbfn, file=sys.stderr)
    start_time = datetime.now()

    # NOTE(review): duplicate_count is only initialized here and reported at
    # the end; nothing in this function increments it.
    import_count, duplicate_count, error_count, simstring_count = 0, 0, 0, 0

    with codecs.open(infn, 'rU', encoding=arg.encoding) as inf:

        # create SQL DB
        try:
            connection = sqlite.connect(sqldbfn)
        except sqlite.OperationalError as e:
            print("Error connecting to DB %s:" % sqldbfn, e, file=sys.stderr)
            return 1
        cursor = connection.cursor()

        # create SQL tables
        if arg.verbose:
            print("Creating tables ...", end=' ', file=sys.stderr)

        for command in CREATE_TABLE_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating %s:" % sqldbfn, e, "(DB exists?)", file=sys.stderr)
                return 1

        # import data
        if arg.verbose:
            print("done.", file=sys.stderr)
            print("Importing data ...", end=' ', file=sys.stderr)

        # Running counters used as primary keys: one for entities, one for
        # labels, and one per TYPE value for the per-type string tables.
        next_eid = 1
        label_id = {}
        next_lid = 1
        next_pid = dict([(t, 1) for t in TYPE_VALUES])

        for i, l in enumerate(inf):
            l = l.rstrip('\n')

            # parse line into ID and TYPE:LABEL:STRING triples
            try:
                id_, rest = l.split('\t', 1)
            except ValueError:
                # Print at most MAX_ERROR_LINES detailed messages, then a
                # single "suppressing" notice; every bad line is counted.
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated fields, got '%s'" % (
                        i + 1, l), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)", file=sys.stderr)
                error_count += 1
                continue

            # parse TYPE:LABEL:STRING triples
            try:
                triples = []
                for triple in rest.split('\t'):
                    type_, label, string = triple.split(':', 2)
                    # NOTE(review): an unknown TYPE is only reported here;
                    # the triple is still appended and processed below.
                    if type_ not in TYPE_VALUES:
                        print("Unknown TYPE %s" % type_, file=sys.stderr)
                    triples.append((type_, label, string))
            except ValueError:
                if error_count < MAX_ERROR_LINES:
                    print("Error: skipping line %d: expected tab-separated TYPE:LABEL:STRING triples, got '%s'" % (
                        i + 1, rest), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)", file=sys.stderr)
                error_count += 1
                continue

            # insert entity
            # The entity ID is consumed even if the insert below fails.
            eid = next_eid
            next_eid += 1
            try:
                cursor.execute(
                    "INSERT into entities VALUES (?, ?)", (eid, id_))
            except sqlite.IntegrityError as e:
                if error_count < MAX_ERROR_LINES:
                    print("Error inserting %s (skipping): %s" % (
                        id_, e), file=sys.stderr)
                elif error_count == MAX_ERROR_LINES:
                    print("(Too many errors; suppressing further error messages)", file=sys.stderr)
                error_count += 1
                continue

            # insert new labels (if any)
            labels = set([l for t, l, s in triples])
            new_labels = [l for l in labels if l not in label_id]
            for label in new_labels:
                lid = next_lid
                next_lid += 1
                cursor.execute(
                    "INSERT into labels VALUES (?, ?)", (lid, label))
                label_id[label] = lid

            # insert associated strings
            for type_, label, string in triples:
                table = TABLE_FOR_TYPE[type_]
                pid = next_pid[type_]
                next_pid[type_] += 1
                lid = label_id[label]  # TODO
                # Tables flagged in TABLE_HAS_NORMVALUE get an extra column
                # holding the normalized form of the string.
                if TABLE_HAS_NORMVALUE[table]:
                    normstring = string_norm_form(string)
                    cursor.execute(
                        "INSERT into %s VALUES (?, ?, ?, ?, ?)" %
                        table, (pid, eid, lid, string, normstring))
                else:
                    cursor.execute(
                        "INSERT into %s VALUES (?, ?, ?, ?)" %
                        table, (pid, eid, lid, string))

            import_count += 1

            # Progress indicator: one dot per 10000 input lines.
            if arg.verbose and (i + 1) % 10000 == 0:
                print('.', end=' ', file=sys.stderr)

        if arg.verbose:
            print("done.", file=sys.stderr)

        # create SQL indices
        if arg.verbose:
            print("Creating indices ...", end=' ', file=sys.stderr)

        for command in CREATE_INDEX_COMMANDS:
            try:
                cursor.execute(command)
            except sqlite.OperationalError as e:
                print("Error creating index", e, file=sys.stderr)
                return 1

        if arg.verbose:
            print("done.", file=sys.stderr)

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print("Creating simstring DB ...", end=' ', file=sys.stderr)

        try:
            ssdb = simstring.writer(ssdbfn)
            for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                # encode as UTF-8 for simstring
                s = row[0].encode('utf-8')
                ssdb.insert(s)
                simstring_count += 1
            ssdb.close()
        except BaseException:
            print("Error building simstring DB", file=sys.stderr)
            raise

        if arg.verbose:
            print("done.", file=sys.stderr)

        cursor.close()

    # done
    delta = datetime.now() - start_time

    if arg.verbose:
        print(file=sys.stderr)
        # NOTE(review): with true division (Python 3) the fractional part
        # below is a float, so the output can contain two dots -- confirm
        # whether integer division was intended.
        print("Done in:", str(
            delta.seconds) + "." + str(delta.microseconds / 10000), "seconds", file=sys.stderr)

    print("Done, imported %d entries (%d strings), skipped %d duplicate keys, skipped %d invalid lines" % (import_count, simstring_count, duplicate_count, error_count))

    return 0
Ejemplo n.º 15
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
A Unicode sample.

We assume that the source code is written in UTF-8 encoding (see the
encoding declaration in line 2). We can use 8-bit strings as they are
with SimString.
"""

import simstring

# Both the writer and the reader below operate on this database file.
DB_NAME = 'sample_unicode.db'

# Create the database in Unicode mode, store a single string and close it.
db = simstring.writer(DB_NAME, 3, False, True)
db.insert('スパゲティ')
db.close()


# Reopen the same database, this time for retrieval.
db = simstring.reader(DB_NAME)

# Retrieve with cosine similarity at a threshold of 0.6.
db.measure = simstring.cosine
db.threshold = 0.6

# Query with an 8-bit string encoded in UTF-8 and print the hits.
hits = db.retrieve('スパゲティー')
print(' '.join(hits))
Ejemplo n.º 16
0
            except sqlite.OperationalError, e:
                print >> sys.stderr, "Error creating index", e
                return 1

        if arg.verbose:
            print >> sys.stderr, "done."

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print >> sys.stderr, "Creating simstring DB ...",

        try:
            ssdb = simstring.writer(ssdbfn)
            for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                # encode as UTF-8 for simstring
                s = row[0].encode('utf-8')
                ssdb.insert(s)
                simstring_count += 1
            ssdb.close()
        except:
            print >> sys.stderr, "Error building simstring DB"
            raise

        if arg.verbose:
            print >> sys.stderr, "done."

        cursor.close()
Ejemplo n.º 17
0
import simstring
import os

# create the databases for Simstring to use for dictionary matching during preprocessing

# create name database: every stripped line of each source file below
# becomes one entry in dicts/people.db
people_sources = (
    'chinese_only.txt',
    'english_only.txt',
    'frequent_last_names.txt',
    'shared.txt',
)
db = simstring.writer('dicts' + os.sep + 'people.db')
for source in people_sources:
    with open('dicts' + os.sep + source) as f:
        for word in f:
            db.insert(word.strip())
db.close()

# create place database
db = simstring.writer('dicts' + os.sep + 'places.db')
with open('dicts' + os.sep + 'city_full.txt') as f:
    for word in f:
        db.insert(word.strip())
with open('dicts' + os.sep + 'country_full.txt') as f:
    for word in f:
        db.insert(word.strip())
with open('dicts' + os.sep + 'region_full.txt') as f:
    for word in f:
Ejemplo n.º 18
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
A Unicode sample.

We assume that the source code is written in UTF-8 encoding (see the
encoding declaration in line 2). We can use 8-bit strings as they are
with SimString.
"""

import simstring

# Write one string into a fresh database opened in Unicode mode.
db = simstring.writer('sample_unicode.db', 3, False, True)
db.insert('スパゲティ')
db.close()

# Switch to a reader on the database that was just written.
db = simstring.reader('sample_unicode.db')

# Cosine similarity with a threshold of 0.6.
db.measure = simstring.cosine
db.threshold = 0.6

# Run the same lookup twice: first with an 8-bit string encoded in UTF-8,
# then with a Unicode object converted into a UTF-8 query string.
for query in ('スパゲティー', u'スパゲティー'.encode('utf-8')):
    print(' '.join(db.retrieve(query)))
Ejemplo n.º 19
0
            except sqlite.OperationalError, e:
                print >> sys.stderr, "Error creating index", e
                return 1

        if arg.verbose:
            print >> sys.stderr, "done."

        # wrap up SQL table creation
        connection.commit()

        # create simstring DB
        if arg.verbose:
            print >> sys.stderr, "Creating simstring DB ...",
        
        try:
            ssdb = simstring.writer(ssdbfn)
            for row in cursor.execute(SELECT_SIMSTRING_STRINGS_COMMAND):
                # encode as UTF-8 for simstring
                s = row[0].encode('utf-8')
                ssdb.insert(s)
                simstring_count += 1
            ssdb.close()
        except:
            print >> sys.stderr, "Error building simstring DB"
            raise

        if arg.verbose:
            print >> sys.stderr, "done."

        cursor.close()
Ejemplo n.º 20
0
#!/usr/bin/env python

import simstring

# Build a SimString database holding two person names.
db = simstring.writer('sample.db')
for person in ('Barack Hussein Obama II', 'James Gordon Brown'):
    db.insert(person)
db.close()


# Reopen the database in read mode.
db = simstring.reader('sample.db')

# Cosine similarity, threshold 0.6: the first two queries are OK,
# while 'Obama' alone is too dissimilar.
db.measure = simstring.cosine
db.threshold = 0.6
for query in ('Barack Obama', 'Gordon Brown', 'Obama'):
    print(db.retrieve(query))

# Overlap coefficient, threshold 1.0: now 'Obama' is OK.
db.measure = simstring.overlap
db.threshold = 1.
print(db.retrieve('Obama'))