def _(bm):
    """Issue one cosine-measure search per line of ``path``, up to a limit.

    ``db``, ``path`` and ``SEARCH_COUNT_LIMIT`` are resolved from the
    enclosing scope. ``bm`` is accepted but not used in this body
    (presumably a benchmark handle supplied by the caller - confirm).
    Each query is the line with trailing CR/LF removed, searched with 0.8
    as the second argument; the search results are discarded.
    """
    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as handle:
        for index, raw_line in enumerate(handle):
            # Stop once SEARCH_COUNT_LIMIT queries have been issued.
            if SEARCH_COUNT_LIMIT <= index:
                break
            query = raw_line.rstrip('\r\n')
            searcher.search(query, 0.8)
def output_similar_strings_of_each_line(path):
    """Print the ranked search results of every line of ``path`` against all lines.

    The file is read twice: the first pass adds each line (trailing CR/LF
    removed) to a ``DictDatabase`` built on a 2-character n-gram feature
    extractor, the second pass runs ``ranked_search`` with 0.8 for each line.

    One output row is printed per input line, formatted as
    ``<line>\\t<item>,<item>,...`` where each item is ``<v0> <v1>``:
    element 0 of a search result rounded to 5 decimals, then element 1
    (presumably the similarity score and the matched string - confirm).

    Args:
        path: Path of the text file whose lines are indexed and queried.
    """
    db = DictDatabase(CharacterNgramFeatureExtractor(2))

    # First pass: register every line in the database.
    with open(path, 'r') as source:
        for raw_line in source:
            db.add(raw_line.rstrip('\r\n'))

    searcher = Searcher(db, CosineMeasure())

    # Second pass: query each line against the fully populated database.
    with open(path, 'r') as source:
        for raw_line in source:
            query = raw_line.rstrip('\r\n')
            formatted = []
            for hit in searcher.ranked_search(query, 0.8):
                formatted.append(' '.join((str(round(hit[0], 5)), hit[1])))
            print(query + "\t" + ",".join(formatted))
class TestSearcher(TestCase):
    """Checks ``Searcher.search`` with a cosine measure over five short strings."""

    # Strings loaded into the database by setUp.
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        """Build a searcher over ``strings`` using 2-character n-gram features."""
        database = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in self.strings:
            database.add(entry)
        self.searcher = Searcher(database, CosineMeasure())

    def test_search(self):
        """Compare search results with the expected lists, in a fixed order."""
        # (query, second argument passed to search, expected result)
        expectations = (
            ('a', 1.0, ['a']),
            ('ab', 1.0, ['ab']),
            ('ab', 0.9, ['ab']),
            ('ab', 0.5, ['ab', 'abc', 'abcd']),
        )
        # A plain loop keeps the stop-at-first-failure behaviour of
        # consecutive assertEqual calls.
        for query, threshold, expected in expectations:
            self.assertEqual(self.searcher.search(query, threshold), expected)
def setUp(self):
    """Create ``self.searcher`` over every entry of ``self.strings``.

    The database uses a 2-character n-gram feature extractor and the
    searcher uses a cosine measure.
    """
    extractor = CharacterNgramFeatureExtractor(2)
    database = DictDatabase(extractor)
    for entry in self.strings:
        database.add(entry)
    measure = CosineMeasure()
    self.searcher = Searcher(database, measure)
# NOTE(review): `lower`, `upper`, `clean_terms`, `raw_terms` and `output_file`
# are not defined anywhere in the visible code, so these two statements
# presumably belong to an earlier definition - confirm their placement.
results = medgate_trial_json(lower, upper, clean_terms, raw_terms)
with open(output_file, 'w+') as f:
    json.dump(results, f)

# Load the spaCy model; if loading fails with OSError, download it and retry.
try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    # Only OSError is caught (raised by spacy.load when the model package is
    # missing); a bare `except:` would also swallow KeyboardInterrupt and
    # unrelated failures.
    os.system('python -m spacy download en_core_web_md')
    nlp = spacy.load('en_core_web_md')

# Resource files are looked up under sys.path[0] (normally the directory of
# the script being run). The context manager closes the stop-word file.
with open(os.path.join(sys.path[0], 'stopwords.txt')) as stopwords_file:
    stopwords = set(stopwords_file.read().split('\n'))

# NOTE(review): `load_pickle` is defined elsewhere; if it unpickles the file,
# db.pickle must come from a trusted source - confirm.
database = load_pickle(os.path.join(sys.path[0], 'db.pickle'), 'rb')
searcher = Searcher(database, CosineMeasure())

# Input directory of letters (finds all .txt files and ignores rest)
letter_dir = os.path.join(sys.path[0], 'letter_directory/')
letter_type = '.txt'

# Read in letters
letters = get_letters_incl_spacy(letter_dir, letter_type)

# Cosine thresholds
lower_threshold = 0.95
upper_threshold = 1.00

# Output directory, named by the "output_dir" environment variable
# (a KeyError is raised if the variable is not set); created if missing.
output_dir = os.path.join(sys.path[0], os.environ["output_dir"])
os.makedirs(output_dir, exist_ok=True)