def refine_index(): """ Refine the data build by index_raw_forms(), in particular removing minor homographs (both lemma-level homographs and wordform-level homographs). Also swaps in standard lemma forms, main-sense definitions, and thesaurus links. """ # Determine which alien variants are okay to keep (because they don't # shadow main forms). - Alien types are wordforms which begin with # a different letter from their parent lemma, and so wouldn't be # correctly filtered by the main letter-by-letter filtering process. stdout.write("Filtering alien types...\n") allowed_alien_types = _filter_alien_types() stdout.write("...done\n") # Initialize the resources that will be used for look-ups vitalstats = VitalStatisticsCache() main_sense_checker = MainSensesCache(with_definitions=True) for letter in string.ascii_lowercase: stdout.write("Refining index for %s...\n" % letter) blocks = [] for block in _raw_pickle_iterator(letter): blocks.append(block) # Remove duplicate types, so that only the version # in the block with the highest frequency is retained. # Cluster together typeunits with the same wordform + wordclass standardmap = defaultdict(lambda: defaultdict(list)) for i, block in enumerate(blocks): for typeunit in block.standard_types: standardmap[typeunit.wordform][typeunit.wordclassflat].append((i, typeunit)) # Go through each wordclass-cluster for each wordform, and pick # the highest-frequency instance in each case for wordform, wordclasses in standardmap.items(): winners = [] for candidates in wordclasses.values(): # Sort by frequency (highest first) candidates.sort(key=lambda c: c[1].f2000, reverse=True) # Remove the first candidate (the highest-frequency one); # this is the one we'll keep. winners.append(candidates.pop(0)) # Delete all the rest for index, typeunit in candidates: blocks[index].standard_types.discard(typeunit) # We should now be left with the highest-scoring wordclasses # for the current wordform (e.g. the highest-frequency # homograph for spell_VB and the highest-frequency # homograph for one spell_NN). We now need to decide which # of these to keep and which to discard discards = _discardable_homographs(winners) for index, typeunit in discards: blocks[index].standard_types.discard(typeunit) # Remove variant types which either duplicate each other # or that shadow a standard type. (Standard types are always # given precedence.) varmap = defaultdict(list) for i, block in enumerate(blocks): for typeunit in block.variant_types: varmap[typeunit.wordform].append((i, typeunit, block.f2000)) for wordform, candidates in varmap.items(): if wordform not in standardmap: # Sort by the frequency of the parent lemma candidates.sort(key=lambda c: c[2], reverse=True) # Remove the first candidate (the highest-frequency # one); this is the one we'll keep. candidates.pop(0) # Delete all the rest for index, typeunit, _ in candidates: blocks[index].variant_types.discard(typeunit) # Remove any alien types that are not allowed (because they # shadow other standard types or variants). 
        for block in blocks:
            to_be_deleted = set()
            for typeunit in block.alien_types:
                if typeunit.wordform not in allowed_alien_types:
                    to_be_deleted.add(typeunit)
            for typeunit in to_be_deleted:
                block.alien_types.discard(typeunit)

        # Remove any blocks whose standard_types and variant_types sets
        #  have now been completely emptied.
        # For the remainder, turn standard_forms and variant_forms
        #  from sets into lists.
        blocks = [_listify_forms(b) for b in blocks
                  if b.standard_types or b.variant_types]

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry,
                                       field="indirect_language")
            if not language and block.start and block.start < 1200:
                language = "West Germanic"
            block = _replace_language(block, language)

            # Acquire main-sense data for this block (which will be
            #  used to swap in a new definition and a thesaurus link)
            if block.type == "entry":
                ms_block_data = main_sense_checker.find_block_data(
                    block.refentry, block.refid)
                if ms_block_data and ms_block_data.senses:
                    main_sense_data = ms_block_data.senses[0]
                    main_sense_confidence = ms_block_data.confidence()
                else:
                    main_sense_data = None
                    main_sense_confidence = None
            else:
                main_sense_data = None
                main_sense_confidence = None

            # Swap in thesaurus-class link
            block = _replace_htclass(block, main_sense_data,
                                     main_sense_confidence)

            if block.type == "entry":
                # Make sure we use the OED headword, not the headword
                #  that's been used in GEL (which could be the version
                #  of the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry,
                                           field="headword")
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the best main-sense definition
                if main_sense_data and main_sense_data.definition:
                    block = _replace_definition(block,
                                                main_sense_data.definition)

            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, "refined",
                                letter + ".json")
        with open(out_file, "w") as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + "\n")
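
# A minimal sketch of the _discardable_homographs() helper called from
#  refine_index() above; the real implementation is not shown in this
#  module, and the 5% cutoff here is illustrative, not the production
#  value. Given the per-wordclass winners for a single wordform, it
#  nominates the minor homographs (those dwarfed by the dominant
#  wordclass) for removal.
def _discardable_homographs(winners):
    """
    Take a list of (block_index, typeunit) pairs - the highest-frequency
    instance of each wordclass for a given wordform - and return the
    pairs judged to be minor homographs of the dominant wordclass.
    """
    if len(winners) < 2:
        return []
    # Rank the wordclass winners by frequency (highest first)
    ranked = sorted(winners, key=lambda w: w[1].f2000, reverse=True)
    top_frequency = ranked[0][1].f2000
    if not top_frequency:
        return []
    # Anything at under 5% of the dominant homograph's frequency is
    #  treated as a minor homograph and discarded.
    return [(index, typeunit) for index, typeunit in ranked[1:]
            if typeunit.f2000 / top_frequency < 0.05]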
def refine_index(self):
    allowed_alien_types = _filter_alien_types()
    vitalstats = VitalStatisticsCache()
    main_sense_checker = MainSensesCache(with_definitions=True)

    for letter in string.ascii_lowercase:
        print('Refining index for %s...' % letter)
        blocks = []
        for block in raw_pickle_iterator(letter):
            blocks.append(block)

        # Remove duplicate types, so that only the version
        #  in the block with the highest frequency is retained.
        standardmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for wordform in block.standard_types:
                standardmap[wordform].append((i, block.frequency))
        for wordform, candidates in standardmap.items():
            if len(candidates) > 1:
                # Sort by frequency
                candidates.sort(key=lambda c: c[1], reverse=True)
                # Remove the first candidate (the highest-frequency
                #  one); this is the one we'll keep.
                candidates.pop(0)
                # Delete all the rest
                for index in [c[0] for c in candidates]:
                    blocks[index].standard_types.discard(wordform)

        # Remove variant types which either duplicate each other or
        #  shadow a standard type (standard types are always given
        #  precedence).
        varmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for wordform in block.variant_types:
                varmap[wordform].append((i, block.frequency))
        for wordform, candidates in varmap.items():
            if wordform not in standardmap:
                # Sort by frequency
                candidates.sort(key=lambda c: c[1], reverse=True)
                # Remove the first candidate (the highest-frequency
                #  one); this is the one we'll keep.
                candidates.pop(0)
            # Delete all the rest
            for index in [c[0] for c in candidates]:
                blocks[index].variant_types.discard(wordform)

        # Remove any alien types that are not allowed (because they
        #  shadow other standard types or variants).
        for block in blocks:
            to_be_deleted = set()
            for wordform in block.alien_types:
                if wordform not in allowed_alien_types:
                    to_be_deleted.add(wordform)
            for wordform in to_be_deleted:
                block.alien_types.discard(wordform)

        # Remove any blocks whose standard_types and variant_types sets
        #  have now been completely emptied.
        # For the remainder, turn standard_forms and variant_forms
        #  from sets into lists.
        blocks = [_listify_forms(b) for b in blocks
                  if b.standard_types or b.variant_types]

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry,
                                       field='indirect_language')
            if not language and block.start and block.start < 1200:
                language = 'West Germanic'
            block = _replace_language(block, language)

            if block.type == 'entry':
                # Make sure we use the OED headword, not the headword
                #  that's been used in GEL (which could be the version
                #  of the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry,
                                           field='headword')
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the correct main-sense definition
                main_sense = main_sense_checker.find_main_sense_data(
                    block.refentry, block.refid)
                if main_sense and main_sense.definition:
                    block = _replace_definition(block,
                                                main_sense.definition)

            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, 'refined',
                                letter + '.json')
        with open(out_file, 'w') as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + '\n')
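
# A minimal sketch of the _listify_forms() helper used by both versions
#  above; the real implementation is not shown here. It assumes the block
#  object is mutable and that its forms are held in sets whose members
#  sort naturally (plain wordform strings in the second version), so that
#  they can become deterministic lists before json.dumps() is called.
def _listify_forms(block):
    """
    Convert a block's standard_types and variant_types from sets into
    sorted lists, so that the block serializes cleanly (and stably)
    to JSON.
    """
    block.standard_types = sorted(block.standard_types)
    block.variant_types = sorted(block.variant_types)
    return block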