Exemple #1
0
 def read_name_index():
     """Reads the name index from `name_index.tsv` located in `DIR_PATH`.

     Returns:
         Result of `NameIndex.read_tsv` for that file
     """
     index_path = os.path.join(DIR_PATH, 'name_index.tsv')
     return NameIndex.read_tsv(index_path)
Exemple #2
0
    def get_name_proposals(self,
                           false_name,
                           n=4,
                           normalize_digits=False,
                           normalize_extras=False,
                           threshold=60):
        """Proposes known names which fuzzy match a name found in the measurement source

        The manufacturer is looked up from the false name first. What remains of the false name after removing the
        matched manufacturer is compared against the models of that manufacturer in `self.name_proposals`.

        Args:
            false_name: Name as it exists in the measurement source
            n: Number of proposals to return
            normalize_digits: Normalize all digits to zeros before calculating fuzzy string matching score
            normalize_extras: Remove extra details in the parentheses at the end of the name
            threshold: Minimum partial ratio score a model needs to be included in the proposals

        Returns:
            NameIndex built from the proposals sorted by descending ratio score and cut to the n first rows.
            Empty NameIndex if no manufacturer is found for the false name.
        """
        def fuzzy(fn, a, b):
            # Scores two names with the given scoring function after the optional normalizations
            a = a.lower()
            b = b.lower()
            if normalize_digits:
                a = re.sub(r'\d', '0', a).strip()
                b = re.sub(r'\d', '0', b).strip()
            if normalize_extras:
                a = re.sub(r'\(.+\)$', '', a).strip()
                b = re.sub(r'\(.+\)$', '', b).strip()
            return fn(a, b)

        manufacturer, manufacturer_match = self.manufacturers.find(false_name)
        if not manufacturer:
            return NameIndex([])
        # Model part of the false name is what is left when the matched manufacturer text is removed
        false_model = re.sub(re.escape(manufacturer_match),
                             '',
                             false_name,
                             flags=re.IGNORECASE).strip()
        # Select only the items with the same manufacturer
        models = self.name_proposals[self.name_proposals.manufacturer ==
                                     manufacturer]

        # Calculate ratios
        model_names = models.model.tolist()
        models = models.assign(
            partial_ratio=[
                fuzzy(fuzz.partial_ratio, model, false_model)
                for model in model_names
            ],
            ratio=[
                fuzzy(fuzz.ratio, model, false_model) for model in model_names
            ])

        # Partial ratio decides which models qualify, full ratio decides their order
        models = models[models.partial_ratio >= threshold]
        # Reassign instead of sorting in place: the filtered frame is derived from another frame and in-place
        # modification of it can raise SettingWithCopyWarning
        models = models.sort_values('ratio', ascending=False)
        proposals = [
            NameItem(None, f'{manufacturer} {row.model}', row.form)
            for _, row in models.iterrows()
        ]
        ni = NameIndex(items=proposals)
        ni.df = ni.df.head(n)
        return ni
Exemple #3
0
 def get_existing():
     """Reads the existing entries from the files three levels below the `data` folder in `DIR_PATH`.

     Returns:
         Result of `NameIndex.read_files` for the glob pattern
     """
     pattern = os.path.join(DIR_PATH, 'data', '*', '*', '*')
     return NameIndex.read_files(pattern)
Exemple #4
0
class CrinacleCrawler(Crawler):
    def __init__(self, driver=None):
        """Initializes the crawler

        Args:
            driver: Passed as is to the initializer of the base class
        """
        # Name index of the phone books, assigned by get_name_proposals()
        # NOTE(review): set before super().__init__(), presumably because the base initializer may call
        # get_name_proposals() which assigns this attribute - confirm before reordering
        self.book_name_index = None
        super().__init__(driver=driver)

    def get_name_proposals(self):
        """Downloads and parses the phone books to get names

        The false name - true name pairs of the three phone books are stored as `self.book_name_index` and added
        to the name proposals of the base class.

        Returns:
            NameIndex

        Raises:
            requests.HTTPError: If downloading a phone book fails with an HTTP error status
        """
        names = super().get_name_proposals()

        # URL of each phone book with the form given to its entries, in the order the rows are collected
        phone_books = (
            # Ears-711 measurements name index
            ('https://crinacle.com/graphing/data_hp/phone_book.json', 'onear'),
            # IEM measurements name index
            ('https://crinacle.com/graphing/data/phone_book.json', 'inear'),
            # Gras measurements name index
            ('https://crinacle.com/graphing/data_hp_gras/phone_book.json',
             'onear'),
        )

        rows = []
        for url, form in phone_books:
            res = requests.get(url)
            # Fail on the HTTP error itself instead of on parsing an error page as JSON
            res.raise_for_status()
            book = self.parse_book(res.json())
            rows.extend([false_name, true_name, form]
                        for false_name, true_name in book.items())

        self.book_name_index = NameIndex(rows)

        # A second instance is merged so that the stored phone book index is not the object given to concat()
        names.concat(NameIndex(rows))
        names.remove_duplicates()
        return names

    @staticmethod
    def parse_book(data):
        """Parses a phone book as dict with false names as keys and true names as values.

        Args:
            data: Phone book object

        Returns:
            Dict with false names and true names
        """
        book = dict()
        for manufacturer in data:
            manufacturer_name = manufacturer['name']
            if 'suffix' in manufacturer:
                manufacturer_name += f' {manufacturer["suffix"]}'
            for model in manufacturer['phones']:
                if type(model) == str:
                    # Plain string
                    book[model.strip()] = f'{manufacturer_name} {model}'.strip(
                    )

                else:
                    # Object
                    if type(model['file']) == str:
                        # Single file as string, wrap in list
                        model['file'] = [model['file']]

                    if 'suffix' in model:
                        for f, suffix in zip(model['file'], model['suffix']):
                            book[f.strip(
                            )] = f'{manufacturer_name} {model["name"]} {suffix}'.strip(
                            )
                    else:
                        for f in model['file']:
                            book[f.strip(
                            )] = f'{manufacturer_name} {model["name"]}'.strip(
                            )

        return book

    @staticmethod
    def read_name_index():
        """Reads the name index from `name_index.tsv` located in `DIR_PATH`.

        Returns:
            Result of `NameIndex.read_tsv` for that file
        """
        index_path = os.path.join(DIR_PATH, 'name_index.tsv')
        return NameIndex.read_tsv(index_path)

    def write_name_index(self):
        """Writes `self.name_index` to `name_index.tsv` located in `DIR_PATH`."""
        index_path = os.path.join(DIR_PATH, 'name_index.tsv')
        self.name_index.write_tsv(index_path)

    @staticmethod
    def get_existing():
        """Reads the existing entries from the CSV files matching `data/**/*.csv` in `DIR_PATH`.

        Returns:
            Result of `NameIndex.read_files` for the glob pattern
        """
        pattern = os.path.join(DIR_PATH, 'data', '**', '*.csv')
        return NameIndex.read_files(pattern)

    def get_urls(self):
        """Collects the raw measurement files grouped by name and measurement rig

        Link source is not a web page but the `raw_data` folder in `DIR_PATH`. The name of a file is its file name
        with the " L" / " R" channel marker (and the digits after it), the ".txt" extension and a trailing
        " #<digit>" removed.

        Returns:
            Dict with names as keys. Values are dicts with rig ("iem", "legacy" or "gras") as keys and lists of
            file paths as values.
        """
        file_paths = {}

        def add_to(_fp, _rig):
            # Registers one file under its name and rig. Uses its own argument rather than a variable of the
            # enclosing scope so that it does not depend on how the caller names its loop variable.
            name = os.path.split(_fp)[1]
            name = re.sub(r' [LR]\d*\.txt', '', name).replace('.txt', '')
            name = re.sub(r' #\d$', '', name)
            file_paths.setdefault(name, {}).setdefault(_rig, []).append(_fp)

        patreon_dir = os.path.join(DIR_PATH, 'raw_data')

        # Folder in raw_data and the rig its measurements are registered with
        sources = (
            # IEMs
            ('IEM Measurements (TSV)', 'iem'),
            # Ears + 711
            ('Legacy Data (EARS + 711)', 'legacy'),
            # Gras
            ('FR Data (CSV)', 'gras'),
        )
        for folder, rig in sources:
            for fp in glob(os.path.join(patreon_dir, folder, '*.txt')):
                add_to(fp, rig)

        for rigs_and_file_paths in file_paths.values():
            if ('iem' in rigs_and_file_paths
                    and ('legacy' in rigs_and_file_paths
                         or 'gras' in rigs_and_file_paths)):
                # Remove IEM rig measurements if Ears-711 or GRAS measurements exist
                # This means the headphone is onear model and the files found in IEM folder are duplicates
                del rigs_and_file_paths['iem']

        return file_paths

    def process(self, item, file_paths, target_dir=None):
        """Averages the measurement files of one item and saves the result as a CSV file

        Every file is parsed into a frequency response which is interpolated and centered before it is added to
        the average. The result is written to "<target_dir>/<true name>/<true name>.csv".

        Args:
            item: NameItem, true name is used as the name of the result and false name in the printed messages
            file_paths: Paths to the measurement files
            target_dir: Directory where the directory of the result is created, must be given

        Returns:
            None

        Raises:
            TypeError: If target_dir is not given
            ValueError: If file_paths is empty
        """
        if target_dir is None:
            raise TypeError('"target_dir" must be given')
        if len(file_paths) == 0:
            # Averaging zero files would divide by zero and save a file full of NaNs
            raise ValueError('"file_paths" must contain at least one file')

        avg_fr = FrequencyResponse(name=item.true_name)
        avg_fr.raw = np.zeros(avg_fr.frequency.shape)
        for fp in file_paths:
            with open(fp, 'r', encoding='utf-8') as fh:
                s = fh.read()

            freq = []
            raw = []
            for line in s.split('\n'):
                if not line or line.startswith('*'):
                    # Skip empty lines and comments
                    if 'C-weighting compensation: On' in line:
                        print(f'C-weighted measurement: {item.false_name}')
                    continue

                # Columns are separated by comma and space, tab or space, tried in this order
                frp = line.split(', ')
                if len(frp) == 1:
                    frp = line.split('\t')
                if len(frp) == 1:
                    frp = line.split(' ')
                if len(frp) not in (2, 3):
                    # Must be comment line
                    continue
                # Only the two first columns are used, a third one is ignored
                f, r = frp[0], frp[1]

                if f == '?' or r == '?':
                    # Skip lines with missing data
                    continue

                try:
                    # Convert both values before storing either one so that a row where only one of them is
                    # a number cannot leave the two lists with different lengths
                    f, r = float(f), float(r)
                except ValueError:
                    # Failed to convert values to floats, must be header or comment row, skip
                    continue
                freq.append(f)
                raw.append(r)

            # Create standard fr object
            fr = FrequencyResponse(name=item.true_name,
                                   frequency=freq,
                                   raw=raw)
            fr.interpolate()
            fr.center()
            avg_fr.raw += fr.raw

        avg_fr.raw /= len(file_paths)

        # Save
        dir_path = os.path.join(target_dir, avg_fr.name)
        os.makedirs(dir_path, exist_ok=True)
        file_path = os.path.join(dir_path, f'{avg_fr.name}.csv')
        avg_fr.write_to_csv(file_path)
        print(f'Saved "{avg_fr.name}" to "{file_path}"')

    def prompt(self, false_name, form=None):
        """Prompts user for true name and form based on false name.

        When name proposals are available, the false name is first translated with the phone book name index and
        the proposals matching the translated name are offered as options. The manufacturer is prompted as well
        if it is not known for the selected name.

        Args:
            false_name: Name as it exists in the measurement source
            form: Form of the headphone if known. When given, only the proposals with the same form are offered.

        Returns:
            NameItem with the false name, the selected true name and the form, or None if no true name was
            selected
        """
        if self.name_proposals is not None:
            # Translate the false name with the phone books, fall back to the false name itself
            # NOTE(review): self.book_name_index is None until get_name_proposals() has run - presumably
            # self.name_proposals being set implies that, confirm
            intermediate_name = self.book_name_index.find(
                false_name=false_name)
            if len(intermediate_name) == 0:
                intermediate_name = false_name
            else:
                intermediate_name = intermediate_name.items[0].true_name
            # Name proposals initialized, add matching entries to options in prompt
            matches = []
            matches += self.name_proposals.search_by_false_name(
                intermediate_name)
            matches += self.name_proposals.search_by_true_name(
                intermediate_name)
            # Tuples of (true name, ratio, form), one per distinct true name
            # Each match is indexed as (item, ratio)
            names_and_ratios = []
            for match in matches:
                if not match[0].true_name:
                    # Skip items without true name
                    continue
                if form is not None and form != match[0].form:
                    # Skip items which don't match the given form
                    continue
                if match[1] == 100:
                    # Exact match
                    # NOTE(review): this modifies the matched item itself; if the searches return the items of
                    # the name proposals rather than copies, the mark stays on them - confirm
                    match[0].true_name += ' ✓'
                # Names are compared without the exact match mark
                if match[0].true_name.replace(' ✓', '') not in [
                        x[0].replace(' ✓', '') for x in names_and_ratios
                ]:
                    # New match
                    names_and_ratios.append(
                        (match[0].true_name, match[1], match[0].form))
                else:
                    # Existing match, update ratio
                    for i in range(len(names_and_ratios)):
                        if match[0].true_name.replace(
                                ' ✓', '') == names_and_ratios[i][0].replace(
                                    ' ✓', ''):
                            if match[1] > names_and_ratios[i][1]:
                                # Higher ratio replaces name and ratio, form of the first match is kept
                                names_and_ratios[i] = (match[0].true_name,
                                                       match[1],
                                                       names_and_ratios[i][2])

            # Four names with the highest ratios are offered
            name_options = [
                x[0] for x in sorted(
                    names_and_ratios, key=lambda x: x[1], reverse=True)[:4]
            ]
            if intermediate_name not in [
                    s.replace(' ✓', '') for s in name_options
            ]:
                name_options.append(intermediate_name)  # Add the intermediate name as the last option

            # Prompt
            true_name = self.prompt_true_name(name_options)

            if true_name is None:
                return None

            # Find and replace true manufacturer name or prompt it
            if self.manufacturers.find(true_name)[0] is None:
                # Unknown manufacturer, find options with the two first words and prompt it
                # The range stops before the full word count, so the whole name is never a candidate
                manufacturer_options = []
                for i in range(1, min(3, len(true_name.split()))):
                    candidate = ' '.join(true_name.split()[:i])
                    # NOTE(review): prints every candidate to the console, looks like leftover debug output
                    print(candidate)
                    manufacturer_options += self.manufacturers.search(
                        candidate)
                    if candidate not in [x[0] for x in manufacturer_options]:
                        # Candidate itself is offered with the lowest score
                        manufacturer_options.append((candidate, 0))
                manufacturer_options = sorted(manufacturer_options,
                                              key=lambda x: x[1],
                                              reverse=True)
                manufacturer_options = [x[0] for x in manufacturer_options]
                manufacturer, replace = self.prompt_manufacturer(
                    manufacturer_options)
                _, match = self.manufacturers.find(manufacturer)
                if match:
                    # Add as a new variant in existing manufacturer
                    for m in self.manufacturers.manufacturers:
                        if m[0] == match:
                            m.append(replace)
                else:
                    # Add new manufacturer
                    self.manufacturers.manufacturers.append([manufacturer])
                # Save the updated manufacturers
                self.manufacturers.write()
            # Replace
            true_name = self.manufacturers.replace(true_name)

            # Find the answer and select form
            for name, ratio, f in names_and_ratios:
                if true_name == name:
                    form = f
                    break
            # Exact match mark is only for the prompt, not part of the name
            true_name = true_name.replace(' ✓', '')

        else:
            # No name proposals, the false name is the only option
            true_name = self.prompt_true_name([false_name])

        if true_name is None:
            # User skipped
            return None

        if form is None:
            # Form not found in name proposals, prompt it
            form = self.prompt_form()

        return NameItem(false_name, true_name, form)

    def process_new(self, prompt=True):
        """Processes all new measurements

        Updates name index with the new entries not found in the name index previously.

        Args:
            prompt: Prompt the true name and form of the unknown items from the user. When False, unknown items
                are added to the name index without true name and form and they are not processed.

        Returns:
            None
        """
        # Form and target directory by measurement rig, any other rig is saved as in-ear without a known form
        rig_targets = {
            'gras': ('onear',
                     os.path.join(DIR_PATH, 'data', 'onear', 'GRAS 43AG-7')),
            'legacy': ('onear',
                       os.path.join(DIR_PATH, 'data', 'onear', 'Ears-711')),
        }
        inear_target = (None, os.path.join(DIR_PATH, 'data', 'inear'))

        for false_name, rigs_and_file_paths in self.urls.items():
            for rig, file_paths in rigs_and_file_paths.items():
                try:
                    ni = self.name_index.find(false_name=false_name)
                    item = ni.items[0] if ni else None

                    if item and item.form == 'ignore':
                        continue

                    # TODO: Infer form from the file path
                    file_paths = [os.path.abspath(p) for p in file_paths]
                    form, target_dir = rig_targets.get(rig, inear_target)

                    if item and item.true_name:
                        # Name index contains the entry
                        if not self.existing.find(true_name=item.true_name):
                            # Doesn't exist yet
                            if form is not None:
                                item.form = form
                            self.process(item,
                                         file_paths,
                                         target_dir=target_dir)
                        continue

                    # Unknown item
                    if not prompt:
                        print(
                            f'"{false_name}" is not known. Add true name and form to name index and run again.'
                        )
                        self.name_index.update(NameItem(
                            false_name, None, None),
                                               false_name=false_name)
                        self.write_name_index()
                        continue

                    # Prompt true name and form
                    print(f'\n"{false_name}" is not known.')
                    item = self.prompt(false_name, form=form)
                    if item is None:
                        # NOTE(review): the ignore entry is not written to the file here, it is saved only
                        # when the name index is written the next time - confirm this is intended
                        self.name_index.update(NameItem(
                            false_name, None, 'ignore'),
                                               false_name=false_name)
                        continue
                    self.name_index.update(item, false_name=false_name)
                    self.process(item, file_paths, target_dir=target_dir)
                    self.write_name_index()
                except Exception:
                    # Tell which item failed and let the original exception propagate as it is
                    print(f'Processing failed for "{false_name}"')
                    raise