def main(sentences, language, outfile, confidence, processes):
    """ Perform entity linking over a set of input sentences.
        The service is the Dandelion Entity Extraction API:
        https://dandelion.eu/docs/api/datatxt/nex/v1/ .
        Links with a confidence score below the given threshold are discarded.
    """
    def worker(row):
        sentence = json.loads(row)
        text = sentence.get('text')
        if text:
            sentence['linked_entities'] = link(text, confidence, language)
            return json.dumps(sentence)

    count = 0
    for each in parallel.map(worker, sentences, processes):
        outfile.write(each)
        outfile.write('\n')

        count += 1
        if count % 1000 == 0:
            logger.info('Linked %d sentences', count)

    if count > 0:
        logger.info("Dumped linked sentences to '%s'", outfile.name)

    logger.info('Done, linked %d sentences', count)
def main(sentences, model, language, outfile, processes, gazetteer):
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info("Loading model from '%s' ...", model)
    model, extractor = joblib.load(model)

    classifier = SentenceClassifier(model, extractor, language, gazetteer)

    def worker(batch):
        data = (json.loads(s) for s in batch)
        for classified in classifier.classify_sentences(data):
            yield json.dumps(classified)

    count = 0
    for each in parallel.map(worker, sentences, batch_size=1000,
                             flatten=True, processes=processes):
        outfile.write(each)
        outfile.write('\n')

        count += 1
        if count % 1000 == 0:
            logger.info('Classified %d sentences', count)

    logger.info('Done, classified %d sentences', count)
    if count > 0:
        logger.info("Dumped classified sentences to '%s'", outfile.name)
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set, json.load(verbs).values()),
                       set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences

    counter = defaultdict(int)
    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
def process_corpus(self, items, output_file, dump_unresolved_file=None, genealogics=None, processes=0):
    count = skipped = 0
    genealogics_url_to_id = {}
    for success, item in parallel.map(self.serialize_item, items, processes, flatten=True):
        if success:
            subj, prop, val, url = item
            statement = wikidata.finalize_statement(
                subj, prop, val, self.language, url,
                resolve_property=False, resolve_value=False
            )

            if not statement:
                continue

            output_file.write(statement.encode('utf8'))
            output_file.write('\n')

            if genealogics and url.startswith('http://www.genealogics.org/'):
                genealogics_url_to_id[url] = subj

            count += 1
            if count % 10000 == 0:
                logger.info('Produced %d statements so far, skipped %d names', count, skipped)
        else:
            skipped += 1
            if dump_unresolved_file:
                dump_unresolved_file.write(json.dumps(item))
                dump_unresolved_file.write('\n')

    logger.info('Produced %d statements so far, skipped %d names', count, skipped)
    return genealogics_url_to_id, count, skipped
def label_sentences(self, sentences, normalize_numerical, score_type, core_weight,
                    processes=0, input_encoded=False, output_encoded=False):
    """ Process all the given sentences with the rule-based classifier,
        optionally giving a confidence score

        :param sentences: List of sentence data
        :param normalize_numerical: Whether to automatically normalize numerical expressions
        :param score_type: Which type of score (if any) to use to compute
         the classification confidence
        :param core_weight: Weight of the core FEs (used in the scoring)
        :param processes: how many processes to use to concurrently label sentences
        :param input_encoded: whether the corpus is an iterable of dictionaries or an
         iterable of JSON-encoded documents. JSON-encoded documents are preferable
         over large dictionaries for performance reasons
        :param output_encoded: whether to return a generator of dictionaries or a
         generator of JSON-encoded documents. Prefer encoded output for performance reasons
        :return: Generator of labeled sentences
    """
    def worker(item):
        if input_encoded:
            item = json.loads(item)

        labeled = self.label_sentence(item, normalize_numerical, score_type, core_weight)
        if labeled:
            return json.dumps(labeled) if output_encoded else labeled

    for each in parallel.map(worker, sentences, processes):
        yield each
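# Illustrative, self-contained sketch of the encode/decode round trip the worker
# above performs when both input_encoded and output_encoded are set. The actual
# labeling (self.label_sentence) is replaced by a stub here, so this is an
# assumption-laden example, not the classifier itself.
import json

def _stub_label(sentence):
    sentence['labeled'] = True
    return sentence

input_encoded = output_encoded = True
rows = [json.dumps({'text': 'Bach was born in Eisenach in 1685.', 'lu': 'bear'})]
labeled_rows = []
for item in rows:
    item = json.loads(item) if input_encoded else item
    labeled = _stub_label(item)
    if labeled:
        labeled_rows.append(json.dumps(labeled) if output_encoded else labeled)

assert json.loads(labeled_rows[0])['labeled'] is True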
def lu_count(sentences, processes=0, input_encoded=False):
    """ Count how many sentences per LU there are for each source

        :param iterable sentences: Corpus with the POS-tagged sentences
        :param int processes: how many processes to use for parallel execution
        :param bool input_encoded: whether the corpus is an iterable of dictionaries or an
         iterable of JSON-encoded documents. JSON-encoded documents are preferable
         over large dictionaries for performance reasons
        :return: A dictionary source -> frequencies, where frequencies is
         another dictionary lemma -> count
        :rtype: dict
    """
    def worker(batch):
        freqs = defaultdict(lambda: 0)
        for row in batch:
            sentence = json.loads(row) if input_encoded else row

            parsed = urlparse(sentence['url'])
            if not parsed.netloc:
                logger.warn('cannot parse URL: %s', sentence['url'])
                return

            lu = sentence['lu']
            freqs[(parsed.netloc, lu)] += 1

        return freqs.items()

    frequencies = defaultdict(lambda: defaultdict(lambda: 0))
    for (source, lemma), count in parallel.map(worker, sentences, processes,
                                               batch_size=100, flatten=True):
        frequencies[source][lemma] += count

    return frequencies
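# Minimal usage sketch for lu_count, assuming it is importable from this module
# and that processes=0 falls back to serial execution (both assumptions).
# Frequencies are grouped by the URL's network location and the sentence's LU.
sample_sentences = [
    {'url': 'http://en.wikisource.org/wiki/Bach', 'lu': 'bear'},
    {'url': 'http://en.wikisource.org/wiki/Haydn', 'lu': 'bear'},
    {'url': 'http://www.genealogics.org/getperson.php', 'lu': 'marry'},
]
freqs = lu_count(sample_sentences, processes=0)
assert freqs['en.wikisource.org']['bear'] == 2
assert freqs['www.genealogics.org']['marry'] == 1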
def test_batches(self):
    def consumer(bulk):
        self.assertEqual(len(bulk), batch_size)
        return True

    for batch_size in range(1, 10, 2):
        data = range(batch_size * 5)
        self.assertTrue(all(parallel.map(consumer, data, processes=5,
                                         batch_size=batch_size)))
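# Illustrative sketch of the batch_size/flatten behaviour exercised by the test
# above: with batch_size set the worker receives lists of items, and with
# flatten=True the per-batch results are merged into a single stream.
# The import path (strephit.commons.parallel) is an assumption.
from strephit.commons import parallel

def double_batch(batch):
    # batch is a list of at most 3 items here
    return [x * 2 for x in batch]

doubled = list(parallel.map(double_batch, range(10), processes=2,
                            batch_size=3, flatten=True))
assert sorted(doubled) == [x * 2 for x in range(10)]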
def about_sources(corpus, processes, with_bio):
    """ Items' sources
    """
    def worker(items):
        sources = defaultdict(int)
        for doc in items:
            url = doc.get('url')
            if not url:
                logger.warn('found an item without URL, name: %s, bio: %s',
                            doc.get('name'), doc.get('bio', '')[:100] + ' ...')
                sources['_skipped_'] += 1
                continue
            elif with_bio and len(doc.get('bio') or '') < 5:
                continue

            parsed = urlparse(url)
            if parsed.netloc:
                sources[parsed.netloc] += 1
            else:
                logger.warn('cannot parse URL: %s', url)
                sources['_skipped_'] += 1

        return sources

    aggregated_sources = defaultdict(int)
    corpus = parallel.make_batches(load_scraped_items(corpus), 1000)
    for sources in parallel.map(worker, corpus, processes):
        for k, v in sources.iteritems():
            aggregated_sources[k] += v

    aggregated_sources = sorted(aggregated_sources.items(),
                                key=lambda (_, v): v, reverse=True)
    for source, count in aggregated_sources:
        print source, count

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    count = sum(c for s, c in aggregated_sources)
    display_sources = filter(lambda (s, v): float(v) / count >= 0.01, aggregated_sources)
    sources, values = map(list, zip(*display_sources))

    sources.append('Rest')
    values.append(count - sum(values))

    plt.pie(values, labels=sources)
    plt.axis('equal')
    plt.show()
def extract_sentences(sentences, probabilities, processes=0, input_encoded=False, output_encoded=False):
    """ Extracts some sentences from the corpus following the given probabilities

        :param iterable sentences: Extracted sentences
        :param dict probabilities: Conditional probabilities of extracting a sentence
         containing a specific LU given the source of the sentence. It is therefore a
         mapping (source, LU) -> probability
        :param int processes: how many processes to use for parallel execution
        :param bool input_encoded: whether the corpus is an iterable of dictionaries or an
         iterable of JSON-encoded documents. JSON-encoded documents are preferable
         over large dictionaries for performance reasons
        :param bool output_encoded: whether to return a generator of dictionaries or a
         generator of JSON-encoded documents. Prefer encoded output for performance reasons
        :return: Generator of sentences
    """
    def worker(batch):
        for row in batch:
            sentence = json.loads(row) if input_encoded else row

            parsed = urlparse(sentence['url'])
            if not parsed.netloc:
                logger.warn('cannot parse URL: %s', sentence['url'])
                return

            lu = sentence['lu']
            p = probabilities[(parsed.netloc, lu)]
            if random.random() < p:
                yield parsed.netloc, lu, json.dumps(sentence) if output_encoded else sentence

    counts = defaultdict(lambda: 0)
    for source, lu, sentence in parallel.map(worker, sentences, processes,
                                             batch_size=100, flatten=True):
        counts[(source, lu)] += 1
        yield sentence

    aggs_lu = defaultdict(lambda: 0)
    aggs_source = defaultdict(lambda: 0)
    for (source, lu), n in counts.iteritems():
        aggs_lu[lu] += n
        aggs_source[source] += n

    logger.debug('aggregated statistics per LU: %s', aggs_lu)
    logger.debug('aggregated statistics per source: %s', aggs_source)
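# Minimal usage sketch for extract_sentences (plain-dict input and output,
# serial execution assumed for processes=0): keep roughly half of the 'bear'
# sentences coming from en.wikisource.org. Probabilities are keyed by
# (source, LU) pairs, as in the worker above.
sample = [{'url': 'http://en.wikisource.org/wiki/Bach', 'lu': 'bear',
           'text': 'Bach was born in Eisenach.'}] * 100
sample_probabilities = {('en.wikisource.org', 'bear'): 0.5}
kept = list(extract_sentences(sample, sample_probabilities, processes=0))
assert all(s['lu'] == 'bear' for s in kept)  # sentences are filtered, never rewritten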
def find_ranking(self, processes=0):
    """ Ranks the verbs

        :param int processes: How many processes to use for parallel ranking
        :return: tuple with average tf-idf and average standard deviation ordered rankings
        :rtype: tuple of (OrderedDict, OrderedDict)
    """
    tfidf_ranking = {}
    stdev_ranking = {}
    for lemma, tfidf, stdev in parallel.map(self.score_lemma, self.verbs, processes):
        tfidf_ranking[lemma] = tfidf
        stdev_ranking[lemma] = stdev
    return (OrderedDict(sorted(tfidf_ranking.items(), key=lambda x: x[1], reverse=True)),
            OrderedDict(sorted(stdev_ranking.items(), key=lambda x: x[1], reverse=True)))
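# Self-contained sketch of the ranking step above: given (lemma, tfidf, stdev)
# triples (hard-coded here instead of coming from parallel.map over score_lemma),
# build the two rankings sorted by descending score.
from collections import OrderedDict

scores = [('bear', 0.8, 0.1), ('play', 0.5, 0.4), ('win', 0.9, 0.2)]
tfidf_ranking = dict((lemma, tfidf) for lemma, tfidf, _ in scores)
stdev_ranking = dict((lemma, stdev) for lemma, _, stdev in scores)
tfidf_ranking = OrderedDict(sorted(tfidf_ranking.items(), key=lambda x: x[1], reverse=True))
stdev_ranking = OrderedDict(sorted(stdev_ranking.items(), key=lambda x: x[1], reverse=True))
assert list(tfidf_ranking) == ['win', 'bear', 'play']
assert list(stdev_ranking) == ['play', 'win', 'bear']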
def main(corpus, document_key, language_code, outfile, processes):
    """ Split an input corpus into sentences """
    corpus = load_corpus(corpus, document_key, text_only=True)
    s = PunktSentenceSplitter(language_code)

    logger.info("Starting sentence splitting of the input corpus ...")

    def worker((i, text)):
        sentences = list(s.split(text))
        return json.dumps({i: sentences}) if sentences else None

    for sentences in parallel.map(worker, enumerate(corpus), processes):
        outfile.write(sentences)
        outfile.write('\n')

    return 0
def find_ranking(self, processes=0, bulk_size=10000, normalize=True):
    ranking = defaultdict(int)
    for score in parallel.map(self.score_from_tokens,
                              self._bulkenize(self.tags, bulk_size),
                              processes):
        for k, v in score.iteritems():
            ranking[k] += v

    ranking = OrderedDict(sorted(ranking.items(), key=lambda x: x[1], reverse=True))

    if normalize:
        max_score = float(ranking[next(iter(ranking))])
        for lemma, score in ranking.iteritems():
            ranking[lemma] = score / max_score

    return ranking
def main(classified, lexical_db, outfile, language, semistructured, processes, dump_unresolved):
    """ Serialize classification results into quickstatements
    """
    if semistructured:
        url_to_wid = map_url_to_wid(semistructured)
        logger.info('Used semi-structured dataset to infer %d Wikidata Item IDs',
                    len(url_to_wid))
    else:
        url_to_wid = {}
        logger.info('TIP: using the semi-structured dataset could help in '
                    'resolving the Wikidata Item ID of more subjects')

    lexical_db = json.load(lexical_db)

    count = skipped = 0
    serializer = ClassificationSerializer(language, lexical_db, url_to_wid)
    for success, item in parallel.map(serializer.to_statements, classified,
                                      processes=processes, flatten=True):
        if success:
            outfile.write(item.encode('utf8'))
            outfile.write('\n')

            count += 1
        else:
            skipped += 1
            if dump_unresolved:
                dump_unresolved.write(json.dumps(item))
                dump_unresolved.write('\n')

        if count % 1000 == 0 and count > 0:
            logger.info('Produced %d statements so far, skipped %d names', count, skipped)

    logger.info('Done, produced %d statements, skipped %d names', count, skipped)
    logger.info("Dataset serialized to '%s'", outfile.name)
    if dump_unresolved:
        logger.info("Unresolved entities dumped to '%s'", dump_unresolved.name)
def extract(self, processes=0):
    """ Processes the corpus, extracting sentences from each item
        and storing them in the item itself.

        :param int processes: how many processes to use for parallel tagging
        :return: the extracted sentences
        :rtype: generator of dicts
    """
    self.setup_extractor()
    try:
        count = 0
        for i, (item, extracted) in enumerate(parallel.map(self.extract_from_item,
                                                           self.corpus, processes)):
            if not item.get('name') or not item.get('url'):
                logger.warn('Skipping item without name or URL')
                continue

            # assign a unique incremental ID to each sentence
            # and store information about the originating document
            for each in extracted:
                each['id'] = count
                each['url'] = item['url']
                each['name'] = item['name']
                count += 1
                yield each

            if (i + 1) % 10000 == 0:
                logger.info('Processed %d items, extracted %d sentences', i + 1, count)

        logger.info('Done, total sentences extracted: %d', count)
    finally:
        self.teardown_extractor()
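# Self-contained sketch of the post-processing loop above: every extracted
# sentence receives a global incremental id plus the url/name of its source
# item. Items are hard-coded here; in the real method they come from
# parallel.map over extract_from_item.
items = [
    ({'name': 'Bach', 'url': 'http://example.org/bach'},
     [{'text': 'first sentence'}, {'text': 'second sentence'}]),
    ({'name': 'Haydn', 'url': 'http://example.org/haydn'},
     [{'text': 'third sentence'}]),
]
count = 0
flattened = []
for item, extracted in items:
    for each in extracted:
        each['id'] = count
        each['url'] = item['url']
        each['name'] = item['name']
        count += 1
        flattened.append(each)

assert [s['id'] for s in flattened] == [0, 1, 2]
assert flattened[2]['name'] == 'Haydn'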
def test_exception_single(self):
    self.assertRaises(ValueError, self.consume,
                      parallel.map(self.exc_function, self.list_in,
                                   processes=1, raise_exc=True))
def test_flatten_multi_process(self):
    list_out = parallel.map(self.multi_function, self.list_in, processes=2, flatten=True)
    self.assertEqual(Counter(list_out), Counter(self.correct_multi))
def test_more_workers(self):
    list_out = set(parallel.map(self.function, self.list_in, processes=20))
    self.assertEqual(list_out, self.correct)
def test_with_nones_multi_process(self):
    list_out = set(parallel.map(self.function, self.list_in_nones, processes=1))
    self.assertEqual(list_out, self.correct_nones)
def test_multi_process(self):
    list_out = set(parallel.map(self.function, self.list_in, processes=2))
    self.assertEqual(list_out, self.correct)