import json
from collections import defaultdict, OrderedDict
from itertools import imap

# load_corpus, parallel, logger, PunktSentenceSplitter, TTPosTagger,
# StanfordParser, Tokenizer and the worker functions are provided
# elsewhere in the package.


def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make the classpath work

    # union of all LU verbs, minus the copula and the auxiliary
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set, json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences

    # merge the per-document counts into a single histogram
    counter = defaultdict(int)
    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v
        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
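# A minimal sketch of what a sentence-level worker could look like: split a
# document into sentences, POS-tag each one, and count how many known LU
# verbs appear per sentence. This is illustrative only; the real
# worker_with_sentences / worker_with_sub_sentences live elsewhere in the
# package, and the tag_one call and its (token, pos, lemma) output format
# are assumptions about the tagger API. It relies on the globals set up in
# main above.
def example_worker_with_sentences(text):
    counts = defaultdict(int)
    for sentence in splitter.split(text):
        tagged = tagger.tag_one(sentence)  # assumed to yield (token, pos, lemma)
        lus = sum(1 for token, pos, lemma in tagged
                  if pos.startswith('V') and lemma in all_verbs)
        counts[lus] += 1  # histogram: number of sentences with this many LUs
    return counts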
from sklearn.feature_extraction.text import TfidfVectorizer


def compute_tf_idf_matrix(corpus_path, document_key):
    """ Computes the TF-IDF matrix of the corpus

        :param str corpus_path: path of the corpus
        :param str document_key: where the textual content is in the corpus
        :return: a vectorizer and the computed matrix
        :rtype: tuple
    """
    corpus = load_corpus(corpus_path, document_key, text_only=True)
    vectorizer = TfidfVectorizer()
    return vectorizer, vectorizer.fit_transform(corpus)
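# A minimal usage sketch, assuming the same corpus layout as the other
# commands here ('bio' as the document key; the path is hypothetical).
# fit_transform returns a sparse matrix of shape (n_documents, n_terms),
# and the vectorizer maps column indices back to terms.
vectorizer, matrix = compute_tf_idf_matrix('dev/corpus/', 'bio')
print matrix.shape
terms = vectorizer.get_feature_names()  # column index -> term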
def main(input_dir, document_key, language_code, outfile):
    """ Tokenize an input corpus. Sentence splitting is not performed. """
    corpus = load_corpus(input_dir, document_key)
    t = Tokenizer(language_code)
    logger.info("Starting tokenization of the input corpus ...")
    for i, document in enumerate(corpus):
        tokens = t.tokenize(document)
        # one pretty-printed JSON object per document, keyed by its position
        outfile.write(json.dumps({i: tokens}, indent=2) + '\n')
    return 0
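# For illustration, the inner loop above in isolation, with a naive
# whitespace stand-in for the package's Tokenizer; it shows the exact record
# format written per document. The toy corpus is hypothetical.
toy_corpus = ['Johann Sebastian Bach was a German composer.']
for i, document in enumerate(toy_corpus):
    tokens = document.split()  # stand-in for t.tokenize(document)
    print json.dumps({i: tokens}, indent=2)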
def main(corpus, document_key, language_code, outfile, processes):
    """ Split an input corpus into sentences """
    corpus = load_corpus(corpus, document_key, text_only=True)
    s = PunktSentenceSplitter(language_code)
    logger.info("Starting sentence splitting of the input corpus ...")

    def worker((i, text)):  # Python 2 tuple-parameter unpacking
        sentences = list(s.split(text))
        return json.dumps({i: sentences}) if sentences else None

    for sentences in parallel.map(worker, enumerate(corpus), processes):
        outfile.write(sentences)
        outfile.write('\n')
    return 0
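# parallel.map is assumed to behave like this single-process stand-in: apply
# the worker to every item and silently drop None results, which is why the
# worker above can return None for documents that yield no sentences. This
# is a sketch of the assumed contract, not the package's implementation.
def sequential_map(worker, iterable):
    for item in iterable:
        result = worker(item)
        if result is not None:
            yield result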