def rename_manufacturers():
    """Normalize manufacturer names in name indexes and measurement files.

    For every database listed in ``DBS`` this does two things:

    1. If ``<DIR_PATH>/<db>/name_index.tsv`` exists, every entry that has a
       true name and whose form is not ``'ignore'`` gets its true name passed
       through ``ManufacturerIndex.replace``. Changed entries are updated in
       the index and the index is written back to the same file.
    2. Every ``*.csv`` file found recursively under ``<DIR_PATH>/<db>/data``
       is moved to ``<parent of its directory>/<new name>/<new name>.csv``
       when the replaced name yields a different path. The old directory is
       removed if the move left it empty.

    Names for which ``ManufacturerIndex.replace`` returns ``None`` are
    reported on stdout and left untouched.

    Returns:
        None
    """
    manufacturers = ManufacturerIndex()
    for db in DBS:
        name_index_path = os.path.join(DIR_PATH, db, 'name_index.tsv')
        if os.path.isfile(name_index_path):
            # Rename entries in name index if such exists
            name_index = NameIndex.read_tsv(name_index_path)
            # Iterate over a snapshot because the index is updated inside the loop
            for item in list(name_index.items):
                if item.form == 'ignore' or not item.true_name:
                    continue
                true_name = manufacturers.replace(item.true_name)
                if true_name is None:
                    # Report the index entry itself; no file name exists in this loop
                    print(f'"{item.true_name}" not found in manufacturers')
                    continue
                if true_name == item.true_name:
                    continue
                print(f'Renamed "{item.true_name}" with "{true_name}"')
                name_index.update(
                    NameItem(item.false_name, true_name, item.form),
                    item.false_name, item.true_name, item.form)
            name_index.write_tsv(name_index_path)

        # Rename existing files
        pattern = os.path.join(DIR_PATH, db, 'data', '**', '*.csv')
        for fp in glob(pattern, recursive=True):
            dir_path, file_name = os.path.split(fp)
            # Strip only the extension, not every ".csv" occurrence in the name
            name = os.path.splitext(file_name)[0]
            true_name = manufacturers.replace(name)
            if true_name is None:
                print(f'"{name}" not found in manufacturers')
                continue
            new_dir_path = os.path.abspath(
                os.path.join(dir_path, os.pardir, true_name))
            new_file_path = os.path.join(new_dir_path, f'{true_name}.csv')
            os.makedirs(new_dir_path, exist_ok=True)
            # normcase makes the comparison case-insensitive on platforms
            # where paths are case-insensitive
            old_key = os.path.normcase(os.path.normpath(fp))
            new_key = os.path.normcase(os.path.normpath(new_file_path))
            if new_key != old_key:
                print(
                    f'Moved "{os.path.relpath(fp, DIR_PATH)}" to "{os.path.relpath(new_file_path, DIR_PATH)}"'
                )
                shutil.move(fp, new_file_path)
                try:
                    # Best effort: rmdir only succeeds when the directory is empty
                    os.rmdir(dir_path)
                except OSError:
                    pass
def main():
    """Normalize manufacturer names for a fixed list of measurement databases.

    For each of ``crinacle``, ``headphonecom``, ``innerfidelity``,
    ``oratory1990`` and ``rtings``:

    1. The name index is read from ``<DIR_PATH>/<db>/name_index.tsv`` when the
       file exists, otherwise an empty ``NameIndex`` is used. Every entry that
       has a true name and whose form is not ``'ignore'`` gets its true name
       passed through ``ManufacturerIndex.replace``; changed entries are
       updated. The index is written back when it is truthy.
    2. Every ``*.csv`` file found recursively under ``<DIR_PATH>/<db>/data``
       is moved to ``<parent of its directory>/<new name>/<new name>.csv``
       when the replaced name yields a different path. The old directory is
       removed if the move left it empty.

    Names for which ``ManufacturerIndex.replace`` returns ``None`` are
    reported on stdout and left untouched.

    Returns:
        None
    """
    manufacturers = ManufacturerIndex()
    for db in ['crinacle', 'headphonecom', 'innerfidelity', 'oratory1990', 'rtings']:
        name_index_path = os.path.join(DIR_PATH, db, 'name_index.tsv')
        if os.path.isfile(name_index_path):
            name_index = NameIndex.read_tsv(name_index_path)
        else:
            name_index = NameIndex()

        # Iterate over a snapshot because the index is updated inside the loop
        for item in list(name_index.items):
            if item.form == 'ignore' or not item.true_name:
                continue
            true_name = manufacturers.replace(item.true_name)
            if true_name is None:
                # Report the index entry itself; no file name exists in this loop
                print(f'"{item.true_name}" not found in manufacturers')
                continue
            if true_name == item.true_name:
                continue
            print(f'Renamed "{item.true_name}" with "{true_name}"')
            name_index.update(
                NameItem(item.false_name, true_name, item.form),
                item.false_name, item.true_name, item.form
            )
        # NOTE(review): relies on NameIndex truthiness to skip empty indexes -
        # confirm NameIndex defines __len__ or __bool__
        if name_index:
            name_index.write_tsv(name_index_path)

        pattern = os.path.join(DIR_PATH, db, 'data', '**', '*.csv')
        for fp in glob(pattern, recursive=True):
            dir_path, file_name = os.path.split(fp)
            # Strip only the extension, not every ".csv" occurrence in the name
            name = os.path.splitext(file_name)[0]
            true_name = manufacturers.replace(name)
            if true_name is None:
                print(f'"{name}" not found in manufacturers')
                continue
            new_dir_path = os.path.abspath(os.path.join(dir_path, os.pardir, true_name))
            new_file_path = os.path.join(new_dir_path, f'{true_name}.csv')
            os.makedirs(new_dir_path, exist_ok=True)
            # normcase makes the comparison case-insensitive on platforms
            # where paths are case-insensitive
            old_key = os.path.normcase(os.path.normpath(fp))
            new_key = os.path.normcase(os.path.normpath(new_file_path))
            if new_key != old_key:
                print(f'Moved "{os.path.relpath(fp, DIR_PATH)}" to "{os.path.relpath(new_file_path, DIR_PATH)}"')
                shutil.move(fp, new_file_path)
                try:
                    # Best effort: rmdir only succeeds when the directory is empty
                    os.rmdir(dir_path)
                except OSError:
                    pass
class Crawler(ABC):
    """Abstract base for crawlers that discover, name and download measurements.

    Subclasses provide the source-specific parts (reading and writing the name
    index, listing existing files, crawling URLs and processing a single
    measurement). This class provides the interactive naming prompts, the
    processing loop and download helpers.
    """

    def __init__(self, driver=None):
        """Initializes crawler state by calling the subclass hooks.

        Args:
            driver: Browser driver used by ``get_beautiful_soup``. May be None
                if the subclass never calls that method.
        """
        self.driver = driver
        self.name_index = self.read_name_index()
        self.name_proposals = self.get_name_proposals()
        self.existing = self.get_existing()
        self.urls = self.get_urls()
        self.manufacturers = ManufacturerIndex()

    @staticmethod
    @abstractmethod
    def read_name_index():
        """Reads name index as Index

        Returns:
            NameIndex
        """
        pass

    def get_name_proposals(self):
        """Gets name proposals for new measurements

        Proposals are collected from the name index TSV files of some
        databases and from the measurement file names of others.

        Returns:
            NameIndex
        """
        name_proposals = NameIndex()
        for db in ['crinacle', 'oratory1990', 'rtings', 'referenceaudioanalyzer']:
            name_index = NameIndex.read_tsv(
                os.path.join(DIR_PATH, db, 'name_index.tsv'))
            name_proposals.concat(name_index)
        for db in ['innerfidelity', 'headphonecom']:
            name_index = NameIndex.read_files(
                os.path.join(DIR_PATH, db, 'data', '**', '*.csv'))
            name_proposals.concat(name_index)
        return name_proposals

    @abstractmethod
    def write_name_index(self):
        """Writes name index to a file

        Returns:
            Index
        """
        pass

    @staticmethod
    @abstractmethod
    def get_existing():
        """Reads existing files as Index

        Returns:
            Index
        """
        pass

    @abstractmethod
    def get_urls(self):
        """Crawls measurement URLs

        Returns:
            Dict where headphone names are keys and URLs are values
        """
        pass

    @abstractmethod
    def process(self, item, url):
        """Downloads a single URL and processes it

        Args:
            item: Item
            url: URL to measurement

        Returns:
            None
        """
        pass

    @staticmethod
    def prompt_true_name(name_options):
        """Prompts true name from the user.

        The options are shown with zero-based numbers and 'skip' is always
        offered as option 0. The user may answer with an option number or
        type a name directly.

        Args:
            name_options: List of proposed names or None. The list is not
                modified.

        Returns:
            Selected or typed name, None if the user chose 'skip'
        """
        # Work on a copy so that inserting 'skip' does not alter the caller's list
        name_options = list(name_options) if name_options is not None else []
        if 'skip' not in name_options:
            name_options.insert(0, 'skip')

        s = 'What is it\'s true name?'
        if name_options:
            s += ' Select a number or write the name if none of the options.'
        print(s)
        if name_options:
            print('\n'.join(f'[{i}] {o}' for i, o in enumerate(name_options)))

        while True:
            answer = input('> ')
            try:
                index = int(answer)
            except ValueError:
                # Not a number, the answer is the name itself
                return answer
            # Explicit bounds check: a negative number must not wrap around
            # to the end of the list
            if 0 <= index < len(name_options):
                name = name_options[index]
                return None if name == 'skip' else name
            print('That didn\'t work, try again.')

    @staticmethod
    def prompt_manufacturer(name_options):
        """Prompts true manufacturer from the user.

        The options are shown with one-based numbers. The user may answer with
        an option number or type a manufacturer name directly. After that the
        user is asked which part of the name should be replaced.

        Args:
            name_options: List of proposed manufacturer names or None

        Returns:
            Tuple of manufacturer name and the part of the name to replace
        """
        name_options = name_options if name_options is not None else []
        s = 'What is it\'s true manufacturer name?'
        if name_options:
            s += ' Select a number or write the name if none of the options.'
        print(s)
        if name_options:
            print('\n'.join(f'[{i + 1}] {o}' for i, o in enumerate(name_options)))

        while True:
            answer = input('> ')
            try:
                number = int(answer)
            except ValueError:
                # Not a number, the answer is the manufacturer name itself
                name = answer
                break
            # Numbers are one-based; 0 and negatives must not wrap around to
            # the end of the list
            if 1 <= number <= len(name_options):
                name = name_options[number - 1]
                break
            print('That didn\'t work, try again.')

        print('Which part of the name to replace')
        replace = input('> ')
        return name, replace

    @staticmethod
    def prompt_form():
        """Prompts form from the user.

        Keeps asking until the user gives a valid one-based option number.

        Returns:
            One of 'onear', 'inear' and 'earbud'
        """
        options = ['onear', 'inear', 'earbud']
        print('What is it\'s type?')
        print('\n'.join(f'[{i + 1}] {o}' for i, o in enumerate(options)))
        while True:
            answer = input('> ')
            try:
                number = int(answer)
            except ValueError:
                number = None
            # Numbers are one-based; 0 and negatives must not wrap around to
            # the end of the list
            if number is not None and 1 <= number <= len(options):
                return options[number - 1]
            print('That didn\'t work, try again.')

    def prompt(self, false_name):
        """Prompts user for true name and form based on false name.

        When name proposals are available, the best matching proposals are
        offered as options, exact matches marked with a check mark. The
        manufacturer part of the selected name is normalized with the
        manufacturer index; unknown manufacturers are prompted and saved.

        Args:
            false_name: Name of the item as it appears in the source

        Returns:
            NameItem with false name, true name and form or None if the user
            skipped the item
        """
        form = None
        if self.name_proposals is not None:
            # Name proposals initialized, add matching entries to options in prompt
            matches = []
            matches += self.name_proposals.search_by_false_name(false_name)
            matches += self.name_proposals.search_by_true_name(false_name)

            names_and_ratios = []
            for match in matches:
                proposal, ratio = match[0], match[1]
                if not proposal.true_name:
                    # Skip items without true name
                    continue
                # Build the display label locally instead of appending the
                # mark to proposal.true_name, so the items returned by the
                # searches are never modified
                label = proposal.true_name
                if ratio == 100:
                    # Exact match
                    label += ' ✓'
                for i, (known_label, known_ratio, known_form) in enumerate(names_and_ratios):
                    if known_label == label:
                        # Existing match, update ratio
                        if ratio > known_ratio:
                            names_and_ratios[i] = (known_label, ratio, known_form)
                        break
                else:
                    # New match
                    names_and_ratios.append((label, ratio, proposal.form))

            # Offer four best matches and the false name itself
            ranked = sorted(names_and_ratios, key=lambda x: x[1], reverse=True)
            name_options = [x[0] for x in ranked[:4]]
            if false_name not in name_options:
                name_options.append(false_name)  # Add the false name

            # Prompt
            true_name = self.prompt_true_name(name_options)
            if true_name is None:
                return None

            # Find and replace true manufacturer name or prompt it
            if self.manufacturers.find(true_name)[0] is None:
                # Unknown manufacturer, find options with the two first words and prompt it
                manufacturer_options = []
                words = true_name.split()
                for i in range(1, min(3, len(words))):
                    candidate = ' '.join(words[:i])
                    print(candidate)
                    manufacturer_options += self.manufacturers.search(candidate)
                    if candidate not in [x[0] for x in manufacturer_options]:
                        manufacturer_options.append((candidate, 0))
                manufacturer_options = sorted(
                    manufacturer_options, key=lambda x: x[1], reverse=True)
                manufacturer_options = [x[0] for x in manufacturer_options]
                manufacturer, replace = self.prompt_manufacturer(manufacturer_options)
                _, match = self.manufacturers.find(manufacturer)
                if match:
                    # Add as a new variant in existing manufacturer
                    for m in self.manufacturers.manufacturers:
                        if m[0] == match:
                            m.append(replace)
                else:
                    # Add new manufacturer
                    self.manufacturers.manufacturers.append([manufacturer])
                self.manufacturers.write()

            # Replace
            true_name = self.manufacturers.replace(true_name)

            # Find the answer and select form
            for name, ratio, f in names_and_ratios:
                if true_name == name:
                    form = f
                    break
            true_name = true_name.replace(' ✓', '')

        else:
            true_name = self.prompt_true_name([false_name])
            form = None
            if true_name is None:
                # User skipped
                return None

        if form is None:
            # Form not found in name proposals, prompt it
            form = self.prompt_form()

        return NameItem(false_name, true_name, form)

    def process_new(self, prompt=True):
        """Processes all new measurements

        Updates name index with the new entries not found in the name index previously.

        Args:
            prompt: Ask the user for true name and form of unknown items? If
                False, unknown items are added to the name index without true
                name and form.

        Returns:
            None
        """
        for false_name, url in self.urls.items():
            try:
                ni = self.name_index.find(false_name=false_name)
                item = ni.items[0] if ni else None
                if item and item.form == 'ignore':
                    continue

                if item and item.true_name:
                    # Name index contains the entry
                    if not self.existing.find(true_name=item.true_name):
                        # Doesn't exist already
                        self.process(item, url)

                else:
                    # Unknown item
                    if prompt:
                        # Prompt true name and form
                        print(f'\n"{false_name}" is not known.')
                        item = self.prompt(false_name)
                        if item is None:
                            # NOTE(review): the ignore entry is only written to
                            # the file by a later write_name_index call - confirm
                            # this is intended
                            self.name_index.update(
                                NameItem(false_name, None, 'ignore'),
                                false_name=false_name)
                            continue
                        self.name_index.update(item, false_name=false_name)
                        self.process(item, url)
                    else:
                        print(
                            f'"{false_name}" is not known. Add true name and form to name index and run again.'
                        )
                        self.name_index.update(
                            NameItem(false_name, None, None),
                            false_name=false_name)
                    self.write_name_index()
            except Exception:
                # Tell which item failed and let the original error propagate
                print(f'Processing failed for "{false_name}"')
                raise

    @staticmethod
    def download(url, true_name, output_dir, file_type=None):
        """Downloads a file from a URL

        Args:
            url: URL to download
            true_name: True name of the item to download
            output_dir: Where to write the downloaded file
            file_type: File extension. Detected automatically if None.

        Returns:
            Path to the downloaded file or None if the server did not respond
            with status 200
        """
        output_dir = os.path.abspath(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        # Streamed responses hold their connection until they are closed, the
        # context manager releases it on every exit path
        with requests.get(url, stream=True) as res:
            if res.status_code != 200:
                print(f'Failed to download "{true_name}" at "{url}"')
                return None
            if file_type is None:
                file_type = url.split('.')[-1]
            # Drop query string from the extension
            file_type = file_type.split('?')[0]
            file_path = os.path.join(output_dir, '{}.{}'.format(true_name, file_type))
            with open(file_path, 'wb') as f:
                # Have the raw stream decode content encodings such as gzip
                res.raw.decode_content = True
                shutil.copyfileobj(res.raw, f)
        print('Downloaded to "{}"'.format(file_path))
        return file_path

    def get_beautiful_soup(self, url):
        """Opens the URL with the driver and parses the page HTML

        Args:
            url: URL of the page

        Returns:
            BeautifulSoup of the outer HTML of the page's html element
        """
        self.driver.get(url)
        # find_element with the 'tag name' strategy works in both Selenium 3
        # and 4, find_element_by_tag_name was removed from Selenium 4
        html = self.driver.find_element('tag name', 'html').get_attribute('outerHTML')
        return BeautifulSoup(html, 'html.parser')