def compile_data_points(self, **kwargs):
    letters = kwargs.get("letters", None)
    freq_iterator = FrequencyIterator(inDir=self.in_dir,
                                      outDir=None,
                                      letters=letters,
                                      message="Compiling data points")

    # Keys will be wordclass values (NN, NNS, etc.); values will
    # be a list of data points
    self.data_points = defaultdict(list)
    for entry in freq_iterator.iterate():
        if entry.gram_count() == 1:  # and len(entry.lex_items) == 1:
            lex_items = self.largest_in_each_wordclass(entry.lex_items)
            for item in lex_items:
                for period in item.frequency_table().data:
                    start, end = PERIODS[period]
                    lifespan = start - item.start
                    if lifespan >= -20:
                        wc = wordclass_category(item.wordclass)
                        row = (
                            item.size(date=start),
                            int(lifespan),
                            start,
                            item.frequency_table().frequency(period=period),
                        )
                        self.data_points[wc].append(row)
                        self.data_points["ALL"].append(row)

    for wordclass in self.data_points:
        self.data_points[wordclass].sort(key=lambda p: p[0])
        filepath = os.path.join(PREDICTIONS_DIR, wordclass + ".txt")
        with open(filepath, "w") as fh:
            for data_point in self.data_points[wordclass]:
                fh.write("%0.3g\t%d\t%d\t%0.4g\n" % data_point)
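
# Illustrative sketch (not part of the original source): the layout of one
# line in a PREDICTIONS_DIR/<wordclass>.txt file written above. The values
# here are arbitrary placeholders; the columns are size at the period start,
# lifespan in years, period start year, and frequency for that period.
size, lifespan, start, frequency = 1.52, 130, 1880, 0.0471
line = "%0.3g\t%d\t%d\t%0.4g\n" % (size, lifespan, start, frequency)
# -> "1.52\t130\t1880\t0.0471\n"
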
def calculate_frequency(in_dir, out_dir):
    """
    Calculate the frequency to be assigned to each lemma/type.
    """
    ihandler = InterjectionHandler(in_dir)
    ihandler.index_interjections()

    # Iterate through each entry in the frequency build files
    freq_iterator = FrequencyIterator(in_dir=in_dir,
                                      out_dir=out_dir,
                                      message='Calculating frequencies')
    for entry in freq_iterator.iterate():
        if entry.contains_wordclass('UH'):
            entry.ngram = ihandler.supplement_ngram(entry.form, entry.ngram)
        _apportion_scores(entry)
        for item in entry.lex_items:
            _compute_average_frequencies(item)
            # Add the entry raw size and weighted size to each lex_item
            item.node.set('size', '%0.3g' % item.size(mode='weighted'))
            item.node.set('rawSize', '%d' % item.size(mode='actual'))
            # Add a full frequency table to each lex_item in the entry
            data = {p: {'frequency': f, 'estimate': item.estimated[p]}
                    for p, f in item.average_frequency.items()}
            freq_node = FrequencyTable(data=data).to_xml(band=False, log=False)
            freq_node.set('wcMethod', item.wordclass_method)
            item.node.append(freq_node)
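
# Illustrative sketch (assumption, not from the source): the shape of the
# `data` dict handed to FrequencyTable above - one entry per period, each
# holding the averaged frequency and whether it was estimated. The period
# keys and numeric values here are arbitrary placeholders.
example_data = {
    '1850-1869': {'frequency': 0.012, 'estimate': False},
    '1870-1889': {'frequency': 0.015, 'estimate': True},
}
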
def _load_tables(in_dir):
    # Cache a per-period frequency series for every lexical item, keyed by
    # its type id, so that later stages can look frequencies up in memory.
    FrequencyMemo.data = defaultdict(dict)
    frequency_iterator = FrequencyIterator(inDir=in_dir,
                                           outDir=None,
                                           message='Loading frequency tables')
    for entry in frequency_iterator.iterate():
        for item in entry.lex_items:
            series = [item.frequency(period=period) for period in PERIODS]
            FrequencyMemo.data[item.type_id] = series
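
# Hedged usage sketch: once _load_tables() has run, callers can look up the
# cached per-period series by type id. The path and id below are arbitrary
# placeholders, and how the series is consumed downstream is an assumption,
# not shown in the source.
_load_tables('/path/to/frequency_build')
series = FrequencyMemo.data.get(12345)  # one frequency value per PERIODS entry
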
def index_interjections(self):
    freq_iterator = FrequencyIterator(in_dir=self.in_dir,
                                      message='Checking interjections')
    # Iterate through each entry in the frequency build files, looking
    # for entries for which BNC gives an interjection ratio; cache
    # the main ngram in memory
    for entry in freq_iterator.iterate():
        wordform = entry.form
        bnc_prob = BNC_PROB.find(wordform)
        if bnc_prob and bnc_prob.interjection_ratio:
            self.index[wordform] = entry.raw_ngram()
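
# Hedged usage sketch (the path is a placeholder): index_interjections() is
# run once before supplement_ngram() is consulted, as in calculate_frequency()
# above.
handler = InterjectionHandler('/path/to/frequency_build')
handler.index_interjections()
# handler.index now maps wordforms with a BNC interjection ratio to their
# raw ngram, ready for supplement_ngram().
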
def insert_ngram_values(in_dir, out_dir):
    """
    Fetch ngrams from the Google Ngram repository, and insert them
    into the frequency build data.
    """
    for letter in alphabet:
        # Load in the list of all forms from the lexicon
        forms = {}
        freq_iterator = FrequencyIterator(inDir=in_dir,
                                          outDir=None,
                                          message='Loading ngram data',
                                          letters=letter)
        for entry in freq_iterator.iterate():
            forms[entry.form] = list()

        # Hunt for these lemmas in the GBN data
        for gram_count in (1, 2, 3, 4):
            print('\tchecking %s/%d...' % (letter, gram_count))
            gbn_iterator = TableIterator(gramCount=gram_count,
                                         letter=letter,
                                         verbose=False)
            for ngram in gbn_iterator.iterate():
                if ngram.lemma in forms:
                    line = '%d\t%s' % (gram_count, ngram.line)
                    forms[ngram.lemma].append(line)

        # Add GBN stats to the list of forms
        freq_iterator = FrequencyIterator(inDir=in_dir,
                                          outDir=out_dir,
                                          letters=letter)
        for entry in freq_iterator.iterate():
            gbn_node = etree.SubElement(entry.node, 'gbn')
            for line in forms[entry.form]:
                parts = line.split('\t')
                gram_count = parts.pop(0)
                parts.pop(0)  # remove the sortcode
                parts.pop(0)  # remove the form
                ngram_node = etree.SubElement(gbn_node, 'ngram')
                ngram_node.set('n', gram_count)
                ngram_node.set('wordclass', parts.pop(0))
                if parts:
                    ngram_node.text = ' '.join(parts)
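
# Illustrative sketch (assumption): the tab-separated layout implied by the
# parsing above - gram count, then sortcode, form, wordclass, and any
# remaining ngram tokens. Field values are arbitrary placeholders.
example_line = '2\tsortcode\tform\tNN\tthe\tform'
parts = example_line.split('\t')
n = parts.pop(0)          # '2'  -> the ngram node's 'n' attribute
parts.pop(0)              # sortcode, discarded
parts.pop(0)              # form, discarded
wordclass = parts.pop(0)  # 'NN' -> the 'wordclass' attribute
text = ' '.join(parts)    # remaining tokens -> the element text
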
def find_gaps(in_dir):
    """
    Check for lemmas which don't have any ngram data, and try to find
    these ngrams from the Google Ngram Viewer site.
    """
    freq_iterator = FrequencyIterator(inDir=in_dir,
                                      outDir=None,
                                      message='Checking for gaps')

    # Iterate through each entry in the frequency build files
    gaps = []
    for entry in freq_iterator.iterate():
        if (not entry.tagged_ngrams and
                "'s-" not in entry.form and
                is_4gram_or_5gram(entry.form)):
                # is_initialled_name(entry.form)):
            gaps.append(entry.form)

    outfile = os.path.join(GBN_DIR, '4', 'tmp', next_filename(GBN_DIR))
    with open(outfile, 'w') as filehandle:
        for g in gaps:
            filehandle.write(g + '\n')
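
# Hedged usage sketch (the path is a placeholder): find_gaps() writes one
# lemma per line to a new file under GBN_DIR/4/tmp/, to be picked up by
# whatever later step queries the Google Ngram Viewer site for the missing
# ngrams (that step is not shown here).
find_gaps('/path/to/frequency_build')
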