def insert_frequency(in_dir, out_dir, freq_dir):
    """
    Look up frequency data for every type and attach it to the GEL data.

    For each wordclass set of each entry yielded by the file iterator:

    * the 'size' attribute is stripped from the wordclass set's node;
    * a FrequencyTable is built for every type whose id has frequency
      data, and its XML is appended to that type's node;
    * the tables that were found are summed, and the XML of the summed
      table is appended to the wordclass set's node.

    Arguments:
        in_dir: passed to FileIterator as ``in_dir``.
        out_dir: passed to FileIterator as ``out_dir``.
        freq_dir: passed to FrequencyMemo, which is used to find the
            frequencies for each type id.
    """
    file_iterator = FileIterator(in_dir=in_dir,
                                 out_dir=out_dir,
                                 verbosity='low')
    memo = FrequencyMemo(freq_dir)

    for filecontent in file_iterator.iterate():
        for entry in filecontent.entries:
            for wcs in entry.wordclass_sets():
                etree.strip_attributes(wcs.node, 'size')

                # First pass: map each type id to its table, or to None
                # when no frequency data was found for that id.
                tables_by_id = {}
                for type_unit in wcs.types():
                    found = memo.find_frequencies(type_unit.id)
                    tables_by_id[type_unit.id] = (
                        FrequencyTable(data=found) if found else None
                    )

                # Second pass: attach each table to its own type node.
                for type_unit in wcs.types():
                    own_table = tables_by_id[type_unit.id]
                    if own_table:
                        type_unit.node.append(own_table.to_xml())

                # The wordclass set only gets a summed table when at
                # least one of its types had a table.
                usable = [tbl for tbl in tables_by_id.values() if tbl]
                if usable:
                    summed = sum_frequency_tables(usable)
                    wcs.node.append(summed.to_xml())
def compare_singular_to_plural(self, e):
    """
    Track entries whose plural types outweigh their singular types.

    Only wordclass sets of ``e`` tagged 'NN' with a frequency above 1 are
    examined. Their types are grouped by wordclass tag; when both 'NN'
    and 'NNS' groups exist, the frequency tables of each group are summed
    (types with no frequency table are left out). If the NNS/NN ratio of
    the summed frequencies is greater than 1, a record with the keys
    'label', 'id', 'fpm' and 'ratio' is appended to
    ``self.track['plural_to_singular']``.

    Arguments:
        e: entry object providing ``wordclass_sets()``, ``label`` and
            ``id``.

    Returns None; the only effect is the append to ``self.track``.
    """
    def available_tables(members):
        # Keep only the tables of types that actually have one.
        collected = []
        for member in members:
            if member.frequency_table() is not None:
                collected.append(member.frequency_table())
        return collected

    for wcs in e.wordclass_sets():
        if wcs.wordclass != 'NN':
            continue
        if not (wcs.frequency_table().frequency() > 1):
            continue

        by_wordclass = {}
        for type_unit in wcs.types():
            by_wordclass.setdefault(type_unit.wordclass, []).append(type_unit)
        if 'NN' not in by_wordclass or 'NNS' not in by_wordclass:
            continue

        singular_sum = sum_frequency_tables(
            available_tables(by_wordclass['NN']))
        plural_sum = sum_frequency_tables(
            available_tables(by_wordclass['NNS']))
        f_singular = singular_sum.frequency()
        f_plural = plural_sum.frequency()

        # A falsy (e.g. zero) singular frequency cannot serve as the
        # denominator of the ratio, so such sets are skipped.
        if not f_singular:
            continue
        ratio = f_plural / f_singular
        if ratio > 1:
            self.track['plural_to_singular'].append({
                'label': e.label,
                'id': e.id,
                'fpm': wcs.frequency_table().frequency(),
                'ratio': ratio,
            })
def _construct_node(block, block_type, entry_id, node_id, label,
                    parent_label, frequency_blocks, terse):
    """
    Build and return the <e> element describing one block.

    The element carries the attributes type, xrid, xrnode, obsolete,
    revised, firstDate and lastDate, and the text children <label>,
    <parentLabel>, <lemma> and <definition>.

    When ``frequency_blocks`` is non-empty, the following are added:

    * a frequency table summed over all frequency blocks (omitted when
      ``terse`` is true and there is only one block);
    * one <wordclass penn="..."> child per frequency block, holding that
      block's own frequency table (omitted when ``terse`` is true and
      the block has only one type) and a <types> wrapper with one
      <type penn="..."> per type, each containing a <form> element and
      the type's frequency table.

    Arguments:
        block: source of the obsolete/revised flags, date range, lemma
            and definition.
        block_type: value of the 'type' attribute.
        entry_id, node_id: stringified into 'xrid' and 'xrnode'.
        label, parent_label: text of <label> and <parentLabel>.
        frequency_blocks: sequence of objects with ``wordclass``,
            ``frequency_table`` and ``types`` attributes; may be empty.
        terse: if true, tables that would merely duplicate a single
            child's table are left out.
    """
    def add_text_child(parent, tag, text):
        child = etree.SubElement(parent, tag)
        child.text = text
        return child

    # NOTE(review): is_revised is read without being called, unlike
    # is_marked_obsolete() - presumably an attribute or property; confirm.
    attributes = {
        'type': block_type,
        'xrid': str(entry_id),
        'xrnode': str(node_id),
        'obsolete': str(block.is_marked_obsolete()),
        'revised': str(block.is_revised),
        'firstDate': str(block.date().start),
        'lastDate': str(block.date().end),
    }
    entry_node = etree.Element('e', **attributes)

    add_text_child(entry_node, 'label', label)
    add_text_child(entry_node, 'parentLabel', parent_label)
    add_text_child(entry_node, 'lemma', block.lemma)
    add_text_child(entry_node, 'definition',
                   block.definition(length=DEF_LENGTH, current=True))

    if not frequency_blocks:
        return entry_node

    # Create a frequency node for the entry as a whole, by
    # summing frequencies for each wordclass
    if len(frequency_blocks) > 1 or not terse:
        entry_total = sum_frequency_tables(
            [fblock.frequency_table for fblock in frequency_blocks])
        entry_node.append(entry_total.to_xml())

    for fblock in frequency_blocks:
        member_types = fblock.types
        wordclass_node = etree.SubElement(entry_node, 'wordclass',
                                          penn=fblock.wordclass)
        if len(member_types) > 1 or not terse:
            wordclass_node.append(fblock.frequency_table.to_xml())

        types_wrapper = etree.SubElement(wordclass_node, 'types')
        for type_unit in member_types:
            type_node = etree.SubElement(types_wrapper, 'type',
                                         penn=type_unit.wordclass)
            add_text_child(type_node, 'form', type_unit.form)
            type_node.append(type_unit.frequency_table.to_xml())

    return entry_node