def get_romania_curata():
    """Match scraped "Romania Curata" fortune declarations to Person rows.

    Reads the scraper output and a hand-maintained exceptions mapping from
    JSON files, fuzzy-matches each scraped name against the Person names in
    the database (trying every word-order permutation of the scraped name),
    stores the fortune text on the matched row, and commits the session.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    # Pool of DB names still available for matching; shrinks as names match.
    sql_names = [person.name for person in models.Person.query.all()]
    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r', encoding='utf-8') as f:
        scraper_result = json.load(f)
    with open(path.relpath(
            'mptracker/scraper/romania_curata_exceptions.json'),
            'r', encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        # SequenceMatcher ratio is 0..1; scale to a percentage.
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        person = (
            models.Person.query
            .filter_by(name=name)
            .first()
        )
        if person is not None:  # fixed: was the non-idiomatic `!= None`
            person.romania_curata = "\n".join(fortune)
            print("Found a match for ", name.encode('utf-8'))
            # A DB name can be claimed at most once.
            sql_names.remove(name)

    for name, fortune in scraper_result:
        name_scraper = normalize(name)
        max_matching = (0, 0)
        if name_scraper in person_exceptions:
            add_person(person_exceptions[name_scraper], fortune)
            # Fixed: the manual exception already resolved this name; skip
            # the fuzzy search so the fortune is not also attached to a
            # second, fuzzy-matched person.
            continue
        for temporary_sqlname in sql_names:
            name_sql = normalize(temporary_sqlname)
            # Scraped names may have words in a different order than the
            # DB, so score every permutation and keep the best candidate.
            for perm in permutations(name_scraper.split(" ")):
                current_matching = matching_score(" ".join(perm), name_sql)
                if max_matching[0] < current_matching:
                    max_matching = (current_matching, temporary_sqlname)
        if max_matching[0] > 93:  # empirical similarity threshold
            add_person(max_matching[1], fortune)
    models.db.session.commit()
def get_romania_curata():
    """Attach "Romania Curata" fortune declarations to matching Person rows.

    Loads the scraper output plus a manual exceptions mapping, resolves each
    scraped name to a database Person (exact via the exceptions table,
    otherwise fuzzy over every word-order permutation of the name), stores
    the fortune text on the matched rows, and commits.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    # Names still eligible for matching; a name is removed once claimed.
    sql_names = [person.name for person in models.Person.query.all()]
    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r', encoding='utf-8') as f:
        scraper_result = json.load(f)
    with open(path.relpath('mptracker/scraper/romania_curata_exceptions.json'),
              'r', encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        # Percentage similarity (SequenceMatcher ratio is 0..1).
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        person = models.Person.query.filter_by(name=name).first()
        if person is not None:  # fixed: was the non-idiomatic `!= None`
            person.romania_curata = "\n".join(fortune)
            print("Found a match for ", name.encode('utf-8'))
            # Each DB name can only be matched once.
            sql_names.remove(name)

    for name, fortune in scraper_result:
        name_scraper = normalize(name)
        if name_scraper in person_exceptions:
            # Fixed: the hand-curated mapping wins outright — don't fall
            # through to the fuzzy search, which could attach the same
            # fortune to a second person.
            add_person(person_exceptions[name_scraper], fortune)
            continue
        best_score, best_name = 0, None
        for candidate in sql_names:
            name_sql = normalize(candidate)
            # Word order differs between sources; try every permutation.
            for perm in permutations(name_scraper.split(" ")):
                score = matching_score(" ".join(perm), name_sql)
                if score > best_score:
                    best_score, best_name = score, candidate
        if best_score > 93:  # empirical similarity threshold
            add_person(best_name, fortune)
    models.db.session.commit()
def test_library_import(self):
    """
    Try importing a known good library archive, and verify that the
    contents of the library have completely replaced the old contents.
    """
    def assert_children(lib, expected_count, present=(), absent=()):
        # One place for the repeated "fetch children / check count /
        # check membership" assertion pattern (was duplicated four times).
        children = [
            self.store.get_item(child).url_name for child in lib.children
        ]
        self.assertEqual(len(children), expected_count)
        for block in present:
            self.assertIn(block.url_name, children)
        for block in absent:
            self.assertNotIn(block.url_name, children)

    # Create some blocks to overwrite
    library = LibraryFactory.create(modulestore=self.store)
    lib_key = library.location.library_key
    test_block = ItemFactory.create(
        category="vertical",
        parent_location=library.location,
        user_id=self.user.id,
        publish_item=False,
    )
    test_block2 = ItemFactory.create(
        category="vertical",
        parent_location=library.location,
        user_id=self.user.id,
        publish_item=False,
    )
    # Create a library and blocks that should remain unmolested.
    unchanged_lib = LibraryFactory.create()
    unchanged_key = unchanged_lib.location.library_key
    test_block3 = ItemFactory.create(
        category="vertical",
        parent_location=unchanged_lib.location,
        user_id=self.user.id,
        publish_item=False,
    )
    test_block4 = ItemFactory.create(
        category="vertical",
        parent_location=unchanged_lib.location,
        user_id=self.user.id,
        publish_item=False,
    )
    # Refresh library.
    library = self.store.get_library(lib_key)
    assert_children(library, 2, present=(test_block, test_block2))
    unchanged_lib = self.store.get_library(unchanged_key)
    assert_children(unchanged_lib, 2, present=(test_block3, test_block4))

    extract_dir = path(tempfile.mkdtemp(dir=settings.DATA_DIR))
    # the extract_dir needs to be passed as a relative dir to
    # import_library_from_xml
    extract_dir_relative = path.relpath(extract_dir, settings.DATA_DIR)
    try:
        with tarfile.open(
            path(TEST_DATA_DIR) / 'imports' / 'library.HhJfPD.tar.gz'
        ) as tar:
            safetar_extractall(tar, extract_dir)
        library_items = import_library_from_xml(
            self.store,
            self.user.id,
            settings.GITHUB_REPO_ROOT,
            [extract_dir_relative / 'library'],
            load_error_modules=False,
            static_content_store=contentstore(),
            target_id=lib_key,
        )
    finally:
        # Always clean up the scratch directory, even if the import blows up.
        shutil.rmtree(extract_dir)

    self.assertEqual(lib_key, library_items[0].location.library_key)
    # Import replaced the target library's children entirely.
    library = self.store.get_library(lib_key)
    assert_children(library, 3, absent=(test_block, test_block2))
    # The unrelated library is untouched.
    unchanged_lib = self.store.get_library(unchanged_key)
    assert_children(unchanged_lib, 2, present=(test_block3, test_block4))
def test_library_import(self):
    """
    Try importing a known good library archive, and verify that the
    contents of the library have completely replaced the old contents.
    """
    # Seed a target library with two blocks the import should wipe out.
    library = LibraryFactory.create(modulestore=self.store)
    lib_key = library.location.library_key
    target_kwargs = dict(
        category="vertical",
        parent_location=library.location,
        user_id=self.user.id,
        publish_item=False,
    )
    test_block = ItemFactory.create(**target_kwargs)
    test_block2 = ItemFactory.create(**target_kwargs)

    # A second library whose contents must survive the import untouched.
    unchanged_lib = LibraryFactory.create()
    unchanged_key = unchanged_lib.location.library_key
    bystander_kwargs = dict(
        category="vertical",
        parent_location=unchanged_lib.location,
        user_id=self.user.id,
        publish_item=False,
    )
    test_block3 = ItemFactory.create(**bystander_kwargs)
    test_block4 = ItemFactory.create(**bystander_kwargs)

    # Refresh library.
    library = self.store.get_library(lib_key)
    kids = [self.store.get_item(ref).url_name for ref in library.children]
    self.assertEqual(len(kids), 2)
    self.assertIn(test_block.url_name, kids)
    self.assertIn(test_block2.url_name, kids)

    unchanged_lib = self.store.get_library(unchanged_key)
    kids = [self.store.get_item(ref).url_name for ref in unchanged_lib.children]
    self.assertEqual(len(kids), 2)
    self.assertIn(test_block3.url_name, kids)
    self.assertIn(test_block4.url_name, kids)

    extract_dir = path(tempfile.mkdtemp(dir=settings.DATA_DIR))
    # the extract_dir needs to be passed as a relative dir to
    # import_library_from_xml
    extract_dir_relative = path.relpath(extract_dir, settings.DATA_DIR)
    archive = path(TEST_DATA_DIR) / 'imports' / 'library.HhJfPD.tar.gz'
    try:
        with tarfile.open(archive) as tar:
            safetar_extractall(tar, extract_dir)
        library_items = import_library_from_xml(
            self.store,
            self.user.id,
            settings.GITHUB_REPO_ROOT,
            [extract_dir_relative / 'library'],
            load_error_modules=False,
            static_content_store=contentstore(),
            target_id=lib_key,
        )
    finally:
        shutil.rmtree(extract_dir)

    self.assertEqual(lib_key, library_items[0].location.library_key)

    # The import replaced the target library's children entirely.
    library = self.store.get_library(lib_key)
    kids = [self.store.get_item(ref).url_name for ref in library.children]
    self.assertEqual(len(kids), 3)
    self.assertNotIn(test_block.url_name, kids)
    self.assertNotIn(test_block2.url_name, kids)

    # The bystander library is untouched.
    unchanged_lib = self.store.get_library(unchanged_key)
    kids = [self.store.get_item(ref).url_name for ref in unchanged_lib.children]
    self.assertEqual(len(kids), 2)
    self.assertIn(test_block3.url_name, kids)
    self.assertIn(test_block4.url_name, kids)