Code example #1
    def compile_data_points(self, **kwargs):
        letters = kwargs.get("letters", None)
        freq_iterator = FrequencyIterator(
            inDir=self.in_dir, outDir=None, letters=letters, message="Compiling data points"
        )

        # Keys will be wordclass values (NN, NNS, etc.); values will
        # be a list of data points
        self.data_points = defaultdict(list)

        for entry in freq_iterator.iterate():
            if entry.gram_count() == 1:  # and len(entry.lex_items) == 1:
                lex_items = self.largest_in_each_wordclass(entry.lex_items)
                for item in lex_items:
                    for period in item.frequency_table().data:
                        start, end = PERIODS[period]
                        lifespan = start - item.start
                        if lifespan >= -20:
                            wc = wordclass_category(item.wordclass)
                            row = (
                                item.size(date=start),
                                int(lifespan),
                                start,
                                item.frequency_table().frequency(period=period),
                            )
                            self.data_points[wc].append(row)
                            self.data_points["ALL"].append(row)

        for wordclass in self.data_points:
            self.data_points[wordclass].sort(key=lambda p: p[0])
            filepath = os.path.join(PREDICTIONS_DIR, wordclass + ".txt")
            with open(filepath, "w") as fh:
                for data_point in self.data_points[wordclass]:
                    fh.write("%0.3g\t%d\t%d\t%0.4g\n" % data_point)
Code example #2
def calculate_frequency(in_dir, out_dir):
    """
    Calculate the frequency to be assigned to each lemma/type.
    """
    ihandler = InterjectionHandler(in_dir)
    ihandler.index_interjections()

    # Iterate through each entry in the frequency build files
    freq_iterator = FrequencyIterator(in_dir=in_dir,
                                      out_dir=out_dir,
                                      message='Calculating frequencies')
    for entry in freq_iterator.iterate():
        if entry.contains_wordclass('UH'):
            entry.ngram = ihandler.supplement_ngram(entry.form, entry.ngram)

        _apportion_scores(entry)
        for item in entry.lex_items:
            _compute_average_frequencies(item)
            # Add the entry raw size and weighted size to each lex_item
            item.node.set('size', '%0.3g' % item.size(mode='weighted'))
            item.node.set('rawSize', '%d' % item.size(mode='actual'))
            # Add a full frequency table to each lex_item in the entry
            data = {p: {'frequency': f, 'estimate': item.estimated[p]}
                    for p, f in item.average_frequency.items()}
            freq_node = FrequencyTable(data=data).to_xml(band=False, log=False)
            freq_node.set('wcMethod', item.wordclass_method)
            item.node.append(freq_node)
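For orientation, after calculate_frequency() runs, each lex_item node should carry size and rawSize attributes plus an appended frequency-table element. A hedged sketch of inspecting one such node, relying only on what the code above sets (the tag name produced by FrequencyTable.to_xml() is not shown in the source):

def describe_lex_item(node):
    # size/rawSize are set explicitly above; the frequency table is
    # the element appended last, whatever its tag name turns out to be.
    print('weighted size:', node.get('size'))
    print('raw size:', node.get('rawSize'))
    freq_node = node[-1]
    print('wordclass method:', freq_node.get('wcMethod'))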
Code example #3
File: frequencymemo.py Project: necrop/gel_build
def _load_tables(in_dir):
    FrequencyMemo.data = defaultdict(dict)
    frequency_iterator = FrequencyIterator(inDir=in_dir,
                                           outDir=None,
                                           message='Loading frequency tables')
    for entry in frequency_iterator.iterate():
        for item in entry.lex_items:
            series = [item.frequency(period=period) for period in PERIODS]
            FrequencyMemo.data[item.type_id] = series
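FrequencyMemo appears to be a simple class-level cache: _load_tables() fills FrequencyMemo.data with one per-period frequency series per type_id. A hedged sketch of an accessor the rest of the build might use (the method name and lazy loading are assumptions, not taken from the project):

class FrequencyMemo(object):
    # Class-level store: type_id -> list of per-period frequencies.
    data = None

    @classmethod
    def frequency_series(cls, type_id, in_dir):
        # Hypothetical lazy loader: build the cache on first use,
        # then answer every lookup from memory.
        if cls.data is None:
            _load_tables(in_dir)
        return cls.data.get(type_id)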
Code example #4
def index_interjections(self):
    freq_iterator = FrequencyIterator(in_dir=self.in_dir,
                                      message='Checking interjections')
    # Iterate through each entry in the frequency build files, looking
    #   for entries for which BNC gives an interjection ratio; cache
    #   the main ngram in memory
    for entry in freq_iterator.iterate():
        wordform = entry.form
        bnc_prob = BNC_PROB.find(wordform)
        if bnc_prob and bnc_prob.interjection_ratio:
            self.index[wordform] = entry.raw_ngram()
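The cached raw ngrams are presumably what supplement_ngram(), used in calculate_frequency() above, draws on. A hypothetical reconstruction of that method, on the assumption that it simply falls back to the indexed ngram (the real merging logic is not shown in the source):

def supplement_ngram(self, wordform, ngram):
    # Assumption: prefer the interjection-aware ngram cached by
    # index_interjections(); keep the original when none was indexed.
    return self.index.get(wordform, ngram)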
Code example #5
def insert_ngram_values(in_dir, out_dir):
    """
    Fetch ngrams from the Google Ngram repository, and insert them
    into the frequency build data
    """
    for letter in alphabet:
        # Load in the list of all forms from the lexicon
        forms = {}
        freq_iterator = FrequencyIterator(inDir=in_dir,
                                          outDir=None,
                                          message='Loading ngram data',
                                          letters=letter)
        for entry in freq_iterator.iterate():
            forms[entry.form] = list()

        # Hunt for these lemmas in the GBN data
        for gram_count in (1, 2, 3, 4):
            print('\tchecking %s/%d...' % (letter, gram_count))
            gbn_iterator = TableIterator(gramCount=gram_count,
                                         letter=letter,
                                         verbose=False)
            for ngram in gbn_iterator.iterate():
                if ngram.lemma in forms:
                    line = '%d\t%s' % (gram_count, ngram.line)
                    forms[ngram.lemma].append(line)

        # Add GBN stats to the list of forms
        freq_iterator = FrequencyIterator(inDir=in_dir,
                                          outDir=out_dir,
                                          letters=letter)
        for entry in freq_iterator.iterate():
            gbn_node = etree.SubElement(entry.node, 'gbn')
            for line in forms[entry.form]:
                parts = line.split('\t')
                gram_count = parts.pop(0)
                parts.pop(0)  # remove the sortcode
                parts.pop(0)  # remove the form
                ngram_node = etree.SubElement(gbn_node, 'ngram')
                ngram_node.set('n', gram_count)
                ngram_node.set('wordclass', parts.pop(0))
                if parts:
                    ngram_node.text = ' '.join(parts)
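The parsing loop implies a fixed tab-separated layout for the cached GBN rows: the gram count (prepended when the line was stored), then sortcode, form, wordclass, and finally the per-year data. A small trace under that assumption, with invented field values:

line = '2\tabc123\tsteam engine\tNN\t1950:3 1960:7 1970:12'
parts = line.split('\t')
gram_count = parts.pop(0)  # '2' -> the ngram node's 'n' attribute
parts.pop(0)               # 'abc123' (sortcode, discarded)
parts.pop(0)               # 'steam engine' (form, discarded)
wordclass = parts.pop(0)   # 'NN' -> the 'wordclass' attribute
text = ' '.join(parts)     # '1950:3 1960:7 1970:12' -> the node text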
Code example #6
File: gapfiller.py Project: necrop/gel_build
def find_gaps(in_dir):
    """
    Check for lemmas which don't have any ngram data, and try to find these
    ngrams from the Google Ngram Viewer site
    """
    freq_iterator = FrequencyIterator(inDir=in_dir,
                                      outDir=None,
                                      message='Checking for gaps')

    # Iterate through each entry in the frequency build files
    gaps = []
    for entry in freq_iterator.iterate():
        if (not entry.tagged_ngrams and
                not "'s-" in entry.form and
                is_4gram_or_5gram(entry.form)):
                #is_initialled_name(entry.form)):
            gaps.append(entry.form)

    outfile = os.path.join(GBN_DIR, '4', 'tmp', next_filename(GBN_DIR))
    with open(outfile, 'w') as filehandle:
        for g in gaps:
            filehandle.write(g + '\n')
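is_4gram_or_5gram() is not shown; going by its name and by the output file landing under GBN_DIR/4, it probably tests the token count of the form. A hypothetical one-liner consistent with that reading:

def is_4gram_or_5gram(form):
    # Assumption: a form counts as a gap candidate when it spans
    # four or five space-separated tokens.
    return len(form.split()) in (4, 5)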