def process_semistructured(corpus_dir, outfile, language, processes,
                           sourced_only, genealogics, dump_unresolved):
    """ Processes the corpus and extracts semi-structured data serialized into QuickStatements.
        Needs a second pass on genealogics to correctly resolve family members.
    """
    resolver = SemistructuredSerializer(language, sourced_only)

    genealogics_url_to_id, count, skipped = resolver.process_corpus(
        io.load_scraped_items(corpus_dir), outfile, dump_unresolved, genealogics, processes
    )
    logger.info('Done, produced %d statements, skipped %d names', count, skipped)

    if not genealogics:
        logger.info("Dataset serialized to '%s'", outfile.name)
        return

    logger.info('Starting second pass on genealogics ...')
    genealogics_data = resolver.resolve_genealogics_family(genealogics, genealogics_url_to_id)
    for success, item in genealogics_data:
        if success:
            outfile.write(item.encode('utf8'))
            outfile.write('\n')

            count += 1
            if count % 10000 == 0:
                logger.info('Produced %d statements so far, skipped %d names', count, skipped)
        else:
            skipped += 1
            if dump_unresolved:
                dump_unresolved.write(json.dumps(item))
                dump_unresolved.write('\n')

    logger.info('Done, produced %d statements, skipped %d names', count, skipped)
    logger.info("Dataset serialized to '%s'", outfile.name)
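
# A minimal usage sketch of the function above, assuming it can be called directly
# with open file objects. The paths, language code and genealogics=None (which skips
# the second pass) are illustrative assumptions, not taken from the original project.
with open('dev/semistructured.qs', 'w') as statements, \
        open('dev/unresolved.jsonlines', 'w') as unresolved:
    process_semistructured('dev/corpus/', statements, 'en', processes=4,
                           sourced_only=True, genealogics=None,
                           dump_unresolved=unresolved)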
def about_biographies_count(corpus):
    """ Finds how many items have/don't have a biography """
    count = with_bio = characters = 0
    for doc in load_scraped_items(corpus):
        count += 1
        if doc.get('bio') and len(doc['bio']) > 5:
            with_bio += 1
            characters += len(doc['bio'])

    print 'Total number of items:', count
    print 'Items with a biography %d (%.2f %%)' % (with_bio, 100. * with_bio / count)
    print 'Cumulative length of biographies: %d characters' % characters

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    plt.bar([0, 1], [count - with_bio, with_bio], width=0.75)
    plt.xticks([0.375, 1.375], ['Without Biography', 'With Biography'])
    plt.grid(True, axis='y')
    plt.xlim((-0.5, 2.25))
    plt.show()
def about_biographies_length(corpus, bins, log_y):
    """ Computes a histogram of biography length """
    lengths = []
    for doc in load_scraped_items(corpus):
        if len(doc.get('bio') or '') > 5:
            lengths.append(len(doc['bio']))

    width = float(max(lengths)) / bins
    buckets = defaultdict(int)
    for each in lengths:
        buckets[int(each / width)] += 1

    # iterate up to and including the last bucket, otherwise the longest
    # biographies would be silently dropped from the report
    for i in xrange(max(buckets.keys()) + 1):
        print '%d - %d: %d' % (i * width, (i + 1) * width - 1, buckets[i])

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    plt.title('Biography length distribution for %d items' % len(lengths))
    plt.xlabel('Biography length in characters')
    plt.ylabel('Number of items')
    plt.hist(lengths, bins=bins, log=log_y)
    plt.grid(True)
    plt.tight_layout()
    plt.show()
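
# The bucketing above can be checked in isolation. A minimal sketch with made-up
# sample values (not taken from the corpus): the maximum length always lands in
# bucket index `bins`, which is why the report loop must run up to
# max(buckets.keys()) + 1.
from collections import defaultdict

sample_lengths = [120, 450, 890, 900]
sample_bins = 3
sample_width = float(max(sample_lengths)) / sample_bins  # 300.0

sample_buckets = defaultdict(int)
for length in sample_lengths:
    sample_buckets[int(length / sample_width)] += 1

print dict(sample_buckets)  # {0: 1, 1: 1, 2: 1, 3: 1} -- note the extra bucket 3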
def main(corpus, document_key, pos_tag_key, language_code, tagger, outfile, tt_home, batch_size):
    """ Perform part-of-speech (POS) tagging over an input corpus. """
    if tagger == 'tt':
        pos_tagger = TTPosTagger(language_code, tt_home)
        logger.info("About to perform part-of-speech tagging with TreeTagger ...")
    else:
        pos_tagger = NLTKPosTagger(language_code)
        logger.info("About to perform part-of-speech tagging with NLTK tagger ...")

    corpus = load_scraped_items(corpus)
    total = 0
    for i, tagged_document in enumerate(pos_tagger.tag_many(corpus, document_key, pos_tag_key, batch_size)):
        total += 1
        outfile.write(json.dumps(tagged_document) + '\n')
        if (i + 1) % 10000 == 0:
            logger.info('processed %d items', i + 1)

    logger.info('Done, total tagged items: %d', total)
    return 0
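
# For reference, a sketch of what a single tagged item might look like before being
# json.dumps'ed to the output file. The field names other than the two keys passed in
# are hypothetical; the (token, POS tag, lemma) triples match the shape that
# produce_lemma_tokens below reads back, assuming document_key='bio' and
# pos_tag_key='pos_tag'.
tagged_document = {
    'name': 'John Doe',
    'bio': 'John Doe was born in 1900.',
    'pos_tag': [('John', 'NP', 'John'), ('Doe', 'NP', 'Doe'),
                ('was', 'VBD', 'be'), ('born', 'VVN', 'bear'),
                ('in', 'IN', 'in'), ('1900', 'CD', '@card@'), ('.', 'SENT', '.')],
}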
def main(corpus, lemma_to_tokens, language_code, strategy, outfile, processes,
         sentences_key, document_key, match_base_form):
    """ Extract corpus sentences containing at least one token in the given set. """
    corpus = load_scraped_items(corpus)
    updated = extract_sentences(corpus, sentences_key, document_key, language_code,
                                json.load(lemma_to_tokens), strategy, match_base_form, processes)

    for item in updated:
        outfile.write(json.dumps(item) + '\n')

    logger.info("Dumped sentences to '%s'", outfile.name)
    return 0
def about_sources(corpus, processes, with_bio):
    """ Items' sources """
    def worker(items):
        sources = defaultdict(int)
        for doc in items:
            url = doc.get('url')
            if not url:
                logger.warn('found an item without URL, name: %s, bio: %s',
                            doc.get('name'), doc.get('bio', '')[:100] + ' ...')
                sources['_skipped_'] += 1
                continue
            elif with_bio and len(doc.get('bio') or '') < 5:
                continue

            parsed = urlparse(url)
            if parsed.netloc:
                sources[parsed.netloc] += 1
            else:
                logger.warn('cannot parse URL: %s', url)
                sources['_skipped_'] += 1
        return sources

    aggregated_sources = defaultdict(int)
    corpus = parallel.make_batches(load_scraped_items(corpus), 1000)
    for sources in parallel.map(worker, corpus, processes):
        for k, v in sources.iteritems():
            aggregated_sources[k] += v

    aggregated_sources = sorted(aggregated_sources.items(),
                                key=lambda (_, v): v, reverse=True)
    for source, count in aggregated_sources:
        print source, count

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    count = sum(c for s, c in aggregated_sources)
    display_sources = filter(lambda (s, v): float(v) / count >= 0.01, aggregated_sources)
    sources, values = map(list, zip(*display_sources))
    sources.append('Rest')
    values.append(count - sum(values))

    plt.pie(values, labels=sources)
    plt.axis('equal')
    plt.show()
def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
    """ Extracts a map from lemma to all its tokens

        :param str pos_tagged_path: path of the pos-tagged corpus
        :param str pos_tag_key: where the pos tag data is in each item
        :param language: language of the corpus
        :return: mapping from lemma to tokens
        :rtype: dict
    """
    corpus = load_scraped_items(pos_tagged_path)
    lemma_tokens = defaultdict(set)

    for item in corpus:
        for token, pos, lemma in item.get(pos_tag_key, []):
            if pos.startswith(VERBAL_PREFIXES[language]):
                lemma_tokens[lemma.lower()].add(token.lower())

    return lemma_tokens
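
# A minimal usage sketch; the corpus path, key name and language code are
# hypothetical, and the comment only illustrates the shape of the returned
# mapping (verbal lemma -> set of lowercased surface forms).
lemma_tokens = produce_lemma_tokens('dev/pos_tagged/', 'pos_tag', 'en')
for lemma, tokens in lemma_tokens.iteritems():
    print lemma, sorted(tokens)  # e.g. bear ['born'], be ['been', 'is', 'was']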
def preprocess_corpus(corpus_dir, document_key, output_dir, items_per_file, min_length):
    """ Remove items without text documents or whose text document is too short """
    filename = 'corpus-%d.jsonlines'
    count = 0
    current_file = open(os.path.join(output_dir, filename % 0), 'w')

    for item in load_scraped_items(corpus_dir):
        if item.get(document_key) and len(item[document_key]) > min_length:
            count += 1
            if count % items_per_file == 0:
                fname = filename % (count / items_per_file)
                logger.info('processed %d items so far, continuing in %s', count, fname)
                current_file.close()
                current_file = open(os.path.join(output_dir, fname), 'w')

            item['id'] = hashlib.sha1(item[document_key].encode('utf8')).hexdigest()
            json.dump(item, current_file)
            current_file.write('\n')

    # close the last chunk so its contents are flushed to disk
    current_file.close()
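
# A minimal usage sketch; directory names, keys and sizes are illustrative. With
# items_per_file=10000 the output directory fills with fixed-size JSON-lines chunks
# (corpus-0.jsonlines, corpus-1.jsonlines, ...), each surviving item carrying a
# SHA-1 'id' computed from its text document.
preprocess_corpus('dev/scraped/', 'bio', 'dev/preprocessed/',
                  items_per_file=10000, min_length=50)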
def __init__(self, corpus_path, pos_tag_key):
    self.tags = self._flatten(item.get(pos_tag_key)
                              for item in load_scraped_items(corpus_path))