コード例 #1
0
ファイル: benchmark.py プロジェクト: nullnull/simstring
 def _(bm):
     """Benchmark body: search for each of the first SEARCH_COUNT_LIMIT lines.

     ``db``, ``path`` and ``SEARCH_COUNT_LIMIT`` are free names resolved from
     the enclosing scope. ``bm`` is accepted but not used here.
     """
     engine = Searcher(db, CosineMeasure())
     with open(path, 'r') as handle:
         for index, raw_line in enumerate(handle):
             if index >= SEARCH_COUNT_LIMIT:
                 break
             query = raw_line.rstrip('\r\n')
             # Only the cost of the call matters; the matches are discarded.
             engine.search(query, 0.8)
コード例 #2
0
ファイル: company_names.py プロジェクト: nullnull/simstring
def output_similar_strings_of_each_line(path):
    """Print each line of ``path`` together with its similar lines.

    The file is read twice: the first pass adds every line (line ending
    stripped) to a ``DictDatabase``; the second pass runs
    ``ranked_search`` with threshold 0.8 for every line and prints one
    tab-separated record: the line itself, then a comma-joined list of
    ``"<first item rounded to 5 places> <second item>"`` entries.

    Args:
        path: Path of the text file to index and query.
    """
    database = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as source:
        for raw_line in source:
            database.add(raw_line.rstrip('\r\n'))

    engine = Searcher(database, CosineMeasure())
    with open(path, 'r') as source:
        for raw_line in source:
            query = raw_line.rstrip('\r\n')
            formatted = []
            # Each hit is indexed as hit[0] / hit[1]; presumably (score, string).
            for hit in engine.ranked_search(query, 0.8):
                formatted.append(str(round(hit[0], 5)) + ' ' + hit[1])
            print("\t".join([query, ",".join(formatted)]))
コード例 #3
0
ファイル: test_searcher.py プロジェクト: nullnull/simstring
class TestSearcher(TestCase):
    """Checks ``Searcher.search`` against a small fixed set of strings."""

    # Fixture strings inserted into the database before every test.
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        """Build a fresh database from ``strings`` and attach a searcher."""
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in self.strings:
            database.add(entry)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search(self):
        """Verify the exact result list for several query/threshold pairs."""
        expectations = (
            ('a', 1.0, ['a']),
            ('ab', 1.0, ['ab']),
            ('ab', 0.9, ['ab']),
            ('ab', 0.5, ['ab', 'abc', 'abcd']),
        )
        # Checked in order; the first mismatch fails the test.
        for query, threshold, expected in expectations:
            self.assertEqual(self.searcher.search(query, threshold), expected)
コード例 #4
0
ファイル: test_searcher.py プロジェクト: nullnull/simstring
 def setUp(self):
     """Populate a new database from ``self.strings`` and store a searcher on it."""
     database = DictDatabase(CharacterNgramFeatureExtractor(2))
     for entry in self.strings:
         database.add(entry)
     # Tests reach the searcher through ``self.searcher``.
     self.searcher = Searcher(database, CosineMeasure())
コード例 #5
0
    results = medgate_trial_json(lower, upper, clean_terms, raw_terms)

    with open(output_file, 'w+') as f:
        json.dump(results, f)


# Load the spaCy model, downloading it first if it is not installed.
# Only OSError is handled: that is what spacy.load raises when the model
# package cannot be found. A bare ``except:`` would also swallow
# KeyboardInterrupt/SystemExit and start a download on unrelated failures.
try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    os.system('python -m spacy download en_core_web_md')
    # If the download failed, this second load raises and surfaces the problem.
    nlp = spacy.load('en_core_web_md')

# Stopwords: the set of newline-separated entries of stopwords.txt, looked up
# in sys.path[0] (normally the directory of the invoked script).
# The context manager closes the file handle instead of leaking it.
with open(os.path.join(sys.path[0], 'stopwords.txt')) as _stopwords_file:
    stopwords = set(_stopwords_file.read().split('\n'))

# NOTE(review): load_pickle is defined elsewhere; if it unpickles db.pickle,
# that file must come from a trusted source -- confirm.
database = load_pickle(os.path.join(sys.path[0], 'db.pickle'), 'rb')
searcher = Searcher(database, CosineMeasure())

# Input directory of letters, located under sys.path[0] (normally the
# directory of the invoked script). Per the original note, only files of
# `letter_type` are picked up and the rest are ignored; that filtering lives
# in get_letters_incl_spacy, which is not visible here.
letter_dir = os.path.join(sys.path[0], 'letter_directory/')
letter_type = '.txt'

# Read in letters (runs at import time).
letters = get_letters_incl_spacy(letter_dir, letter_type)

# Cosine thresholds: lower and upper bounds.
# NOTE(review): presumably consumed by the similarity search further down the
# script -- usage is not visible in this section.
lower_threshold = 0.95
upper_threshold = 1.00

# Output directory (not a file name): taken from the `output_dir` environment
# variable and joined onto sys.path[0]; an absolute value replaces the prefix.
# Raises KeyError if the variable is unset. Created if it does not exist yet.
output_dir = os.path.join(sys.path[0],os.environ["output_dir"])
os.makedirs(output_dir, exist_ok=True)