Example #1
# Imports needed by this snippet (not shown in the original listing):
import json
import os
import string
from collections import defaultdict
from sys import stdout

# Project-local helpers (_filter_alien_types, VitalStatisticsCache,
# MainSensesCache, _raw_pickle_iterator, _discardable_homographs,
# _listify_forms, the _replace_* functions, and FORM_INDEX_DIR) are
# assumed to be defined elsewhere in the same module.

def refine_index():
    """
    Refine the data build by index_raw_forms(), in particular removing minor
    homographs (both lemma-level homographs and wordform-level homographs).

    Also swaps in standard lemma forms, main-sense definitions, and
    thesaurus links.
    """
    # Determine which alien variants are okay to keep (because they don't
    #  shadow main forms). Alien types are wordforms which begin with
    #  a different letter from their parent lemma, and so wouldn't be
    #  correctly filtered by the main letter-by-letter filtering process.
    stdout.write("Filtering alien types...\n")
    allowed_alien_types = _filter_alien_types()
    stdout.write("...done\n")

    # Initialize the resources that will be used for look-ups
    vitalstats = VitalStatisticsCache()
    main_sense_checker = MainSensesCache(with_definitions=True)

    for letter in string.ascii_lowercase:
        stdout.write("Refining index for %s...\n" % letter)
        blocks = []
        for block in _raw_pickle_iterator(letter):
            blocks.append(block)

        # Remove duplicate types, so that only the version
        #  in the block with the highest frequency is retained.
        # Cluster together typeunits with the same wordform + wordclass
        standardmap = defaultdict(lambda: defaultdict(list))
        for i, block in enumerate(blocks):
            for typeunit in block.standard_types:
                standardmap[typeunit.wordform][typeunit.wordclassflat].append((i, typeunit))
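        # e.g. (illustrative): standardmap["spell"]["NN"] ->
        #   [(3, <typeunit>), (17, <typeunit>)], i.e. one (block-index,
        #   typeunit) pair for each occurrence of that wordform + wordclass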
        # Go through each wordclass-cluster for each wordform, and pick
        #  the highest-frequency instance in each case
        for wordform, wordclasses in standardmap.items():
            winners = []
            for candidates in wordclasses.values():
                # Sort by frequency (highest first)
                candidates.sort(key=lambda c: c[1].f2000, reverse=True)
                # Pop the first candidate (the highest-frequency one)
                #  into the winners list; this is the one we'll keep.
                winners.append(candidates.pop(0))
                # Delete all the rest
                for index, typeunit in candidates:
                    blocks[index].standard_types.discard(typeunit)
            # We should now be left with the highest-scoring wordclasses
            #  for the current wordform (e.g. the highest-frequency
            #  homograph for spell_VB and the highest-frequency
            #  homograph for spell_NN). We now need to decide which
            #  of these to keep and which to discard.
            discards = _discardable_homographs(winners)
            for index, typeunit in discards:
                blocks[index].standard_types.discard(typeunit)

        # Remove variant types which either duplicate each other
        #  or shadow a standard type. (Standard types are always
        #  given precedence.)
        varmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for typeunit in block.variant_types:
                varmap[typeunit.wordform].append((i, typeunit, block.f2000))
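        # e.g. (illustrative): varmap["spelle"] ->
        #   [(2, <typeunit>, 5.1), (9, <typeunit>, 0.4)]; the third element
        #   is the parent block's f2000, used below to rank variants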
        for wordform, candidates in varmap.items():
            if wordform not in standardmap:
                # Sort by the frequency of the parent lemma
                candidates.sort(key=lambda c: c[2], reverse=True)
                # Remove the first candidate (the highest-frequency
                #  one); this is the one we'll keep.
                candidates.pop(0)
            # Delete all the rest
            for index, typeunit, _ in candidates:
                blocks[index].variant_types.discard(typeunit)

        # Remove any alien types that are not allowed (because they
        #  shadow other standard types or variants).
        for block in blocks:
            to_be_deleted = set()
            for typeunit in block.alien_types:
                if typeunit.wordform not in allowed_alien_types:
                    to_be_deleted.add(typeunit)
            for typeunit in to_be_deleted:
                block.alien_types.discard(typeunit)

        # Remove any blocks whose standard_types and
        #  variant_types sets have now been completely emptied
        # For the remainder, turn standard_forms and variant_forms
        #  from sets into lists
        blocks = [_listify_forms(b) for b in blocks if b.standard_types or b.variant_types]
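        # (Sets are not JSON-serializable, so converting the form sets to
        #  lists here also prepares each block for json.dumps() below.)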

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry, field="indirect_language")
            if not language and block.start and block.start < 1200:
                language = "West Germanic"
            block = _replace_language(block, language)

            # Acquire main-sense data for this block (which will be
            #  used to swap in a new definition and a thesaurus link)
            if block.type == "entry":
                ms_block_data = main_sense_checker.find_block_data(block.refentry, block.refid)
                if ms_block_data and ms_block_data.senses:
                    main_sense_data = ms_block_data.senses[0]
                    main_sense_confidence = ms_block_data.confidence()
                else:
                    main_sense_data = None
                    main_sense_confidence = None
            else:
                main_sense_data = None
                main_sense_confidence = None

            # Swap in thesaurus-class link
            block = _replace_htclass(block, main_sense_data, main_sense_confidence)

            if block.type == "entry":
                # Make sure we use the OED headword, not the headword
                #  that's been used in GEL (which could be the version
                #  of the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry, field="headword")
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the best main-sense definition
                if main_sense_data and main_sense_data.definition:
                    block = _replace_definition(block, main_sense_data.definition)
            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, "refined", letter + ".json")
        with open(out_file, "w") as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + "\n")
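
The heart of the deduplication step above is a two-level grouping: typeunits are clustered first by wordform and then by wordclass, each cluster is sorted by f2000 frequency, and only the top candidate in each cluster survives. Below is a minimal standalone sketch of that pattern; the TypeUnit namedtuple and the frequency values are invented for illustration.

from collections import defaultdict, namedtuple

# Invented stand-in for the project's typeunit objects
TypeUnit = namedtuple("TypeUnit", ["wordform", "wordclassflat", "f2000"])

# Two toy blocks, each holding a set of standard types
blocks = [
    {TypeUnit("spell", "NN", 4.2), TypeUnit("spell", "VB", 7.9)},
    {TypeUnit("spell", "NN", 0.3)},
]

# Cluster (block-index, typeunit) pairs by wordform, then by wordclass
clusters = defaultdict(lambda: defaultdict(list))
for i, block in enumerate(blocks):
    for tu in block:
        clusters[tu.wordform][tu.wordclassflat].append((i, tu))

# In each cluster, keep the highest-frequency candidate, discard the rest
for wordclasses in clusters.values():
    for candidates in wordclasses.values():
        candidates.sort(key=lambda c: c[1].f2000, reverse=True)
        candidates.pop(0)
        for i, tu in candidates:
            blocks[i].discard(tu)

print(blocks)  # the low-frequency duplicate spell_NN is gone from blocks[1]

Because the per-block types are stored in sets, discard() removes a duplicate in constant time and is a silent no-op if the item is already absent.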
Example #2
    # Module-level imports assumed by this method (not shown in the
    # original listing): json, os, string, and collections.defaultdict;
    # project-local helpers such as raw_pickle_iterator, VitalStatisticsCache,
    # MainSensesCache, and FORM_INDEX_DIR are assumed to be in scope.
    def refine_index(self):
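        """
        Refine the raw index: deduplicate standard and variant types by
        frequency, drop disallowed alien types, and swap in OED headwords
        and main-sense definitions, writing one JSON file per letter.
        """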
        allowed_alien_types = _filter_alien_types()

        vitalstats = VitalStatisticsCache()
        main_sense_checker = MainSensesCache(with_definitions=True)
        for letter in string.ascii_lowercase:
            print('Refining index for %s...' % letter)
            blocks = []
            for block in raw_pickle_iterator(letter):
                blocks.append(block)

            # Remove duplicate types, so that only the version
            #  in the block with the highest frequency is retained.
            standardmap = defaultdict(list)
            for i, block in enumerate(blocks):
                for wordform in block.standard_types:
                    standardmap[wordform].append((i, block.frequency))
            for wordform, candidates in standardmap.items():
                if len(candidates) > 1:
                    # Sort by frequency
                    candidates.sort(key=lambda c: c[1], reverse=True)
                    # Remove the first candidate (the highest-frequency
                    #  one); this is the one we'll keep.
                    candidates.pop(0)
                    # Delete all the rest
                    for index in [c[0] for c in candidates]:
                        blocks[index].standard_types.discard(wordform)

            # Remove variant types which either duplicate each other
            #  or shadow a standard type (standard types are always
            #  given precedence).
            varmap = defaultdict(list)
            for i, block in enumerate(blocks):
                for wordform in block.variant_types:
                    varmap[wordform].append((i, block.frequency))
            for wordform, candidates in varmap.items():
                if wordform not in standardmap:
                    # Sort by frequency
                    candidates.sort(key=lambda c: c[1], reverse=True)
                    # Remove the first candidate (the highest-frequency
                    #  one); this is the one we'll keep.
                    candidates.pop(0)
                # Delete all the rest
                for index in [c[0] for c in candidates]:
                    blocks[index].variant_types.discard(wordform)

            # Remove any alien types that are not allowed (because they
            #  shadow other standard types or variants).
            for block in blocks:
                to_be_deleted = set()
                for wordform in block.alien_types:
                    if wordform not in allowed_alien_types:
                        to_be_deleted.add(wordform)
                for wordform in to_be_deleted:
                    block.alien_types.discard(wordform)

            # Remove any blocks whose standard_types and
            #  variant_types sets have now been completely emptied
            # For the remainder, turn standard_forms and variant_forms
            #  from sets into lists
            blocks = [_listify_forms(b) for b in blocks if b.standard_types
                      or b.variant_types]

            blocks_filtered = []
            for block in blocks:
                language = vitalstats.find(block.refentry,
                                           field='indirect_language')
                if not language and block.start and block.start < 1200:
                    language = 'West Germanic'
                block = _replace_language(block, language)

                if block.type == 'entry':
                    # Make sure we use the OED headword, not the headword
                    #  that's been used in GEL (which could be the version
                    #  of the headword found in ODE or NOAD).
                    headword = vitalstats.find(block.refentry,
                                               field='headword')
                    if headword and headword != block.lemma:
                        block = _replace_lemma(block, headword)
                    # Make sure we use the correct main-sense definition
                    main_sense = main_sense_checker.find_main_sense_data(
                        block.refentry,
                        block.refid)
                    if main_sense and main_sense.definition:
                        block = _replace_definition(block, main_sense.definition)
                blocks_filtered.append(block)

            out_file = os.path.join(FORM_INDEX_DIR, 'refined', letter + '.json')
            with open(out_file, 'w') as filehandle:
                for block in blocks_filtered:
                    filehandle.write(json.dumps(block) + '\n')
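
Both versions emit one JSON document per line, i.e. a JSON Lines file per letter. As a minimal sketch, a refined letter file could be read back as follows; load_refined is a hypothetical helper, and the only assumption is that form_index_dir points at the same FORM_INDEX_DIR used above.

import json
import os

def load_refined(letter, form_index_dir):
    """Read one refined letter file back into a list of blocks."""
    in_file = os.path.join(form_index_dir, "refined", letter + ".json")
    with open(in_file) as filehandle:
        return [json.loads(line) for line in filehandle]

# e.g.: blocks = load_refined("a", FORM_INDEX_DIR)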