def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
    # merge every verb set from the verbs JSON file into a single set,
    # then drop the auxiliaries 'be' and 'have'
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set,
                            json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
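A minimal post-processing sketch for the distribution dumped above: it reloads the JSON written to outfile and prints each bucket with its share of sentences. The path lu_distribution.json is a placeholder, and the counter keys come back as strings after the JSON round trip.

import json

with open('lu_distribution.json') as f:  # hypothetical path for the dumped counter
    distribution = json.load(f)

total = sum(distribution.values())
for key, n_sentences in sorted(distribution.items()):  # keys are strings after JSON round trip
    print('{}: {} sentences ({:.1%})'.format(key, n_sentences, float(n_sentences) / total))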
Example #3
def compute_tf_idf_matrix(corpus_path, document_key):
    """ Computes the TF-IDF matrix of the corpus

        :param str corpus_path: path of the corpus
        :param str document_key: where the textual content is in the corpus
        :return: a vectorizer and the computed matrix
        :rtype: tuple
    """
    corpus = load_corpus(corpus_path, document_key, text_only=True)
    vectorizer = TfidfVectorizer()
    return vectorizer, vectorizer.fit_transform(corpus)
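A hedged usage sketch for the vectorizer/matrix pair returned above, run on a small in-memory corpus instead of load_corpus: it prints the highest-weighted terms of the first document. get_feature_names_out() is the current scikit-learn accessor; older releases expose get_feature_names() instead.

from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = [
    'the quick brown fox jumps over the lazy dog',
    'the dog sleeps all day',
    'foxes are quick and clever',
]
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(toy_corpus)

terms = vectorizer.get_feature_names_out()  # use get_feature_names() on older scikit-learn
weights = matrix[0].toarray().ravel()
for idx in weights.argsort()[::-1][:5]:  # top 5 terms of the first document
    print('{}: {:.3f}'.format(terms[idx], weights[idx]))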
Example #4
def main(input_dir, document_key, language_code, outfile):
    """ Tokenize an input corpus.
        Sentence splitting is not performed.
    """
    corpus = load_corpus(input_dir, document_key)
    t = Tokenizer(language_code)
    logger.info("Starting Tokenization of the input corpus ...")
    for i, document in enumerate(corpus):
        tokens = t.tokenize(document)
        outfile.write(json.dumps({i: tokens}, indent=2) + '\n')
    return 0
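Because each record above is pretty-printed with indent=2, the output file is a stream of multi-line JSON objects rather than one object per line. A reader sketch using raw_decode, assuming the output went to a hypothetical tokens.out:

import json

decoder = json.JSONDecoder()
with open('tokens.out') as f:  # hypothetical output file of the command above
    buf = f.read()

pos, records = 0, []
while pos < len(buf):
    record, end = decoder.raw_decode(buf, pos)  # parse one {index: tokens} object
    records.append(record)
    pos = end
    while pos < len(buf) and buf[pos].isspace():  # skip the separating newline
        pos += 1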
Example #5
def main(corpus, document_key, language_code, outfile, processes):
    """ Split an input corpus into sentences """
    corpus = load_corpus(corpus, document_key, text_only=True)
    s = PunktSentenceSplitter(language_code)

    logger.info("Starting sentence splitting of the input corpus ...")

    def worker((i, text)):
        sentences = list(s.split(text))
        return json.dumps({i: sentences}) if sentences else None

    for sentences in parallel.map(worker, enumerate(corpus), processes):
        outfile.write(sentences)
        outfile.write('\n')

    return 0
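Here each record is dumped without indent, so every non-empty result is a single JSON object per line and can be read back line by line. A minimal reader sketch, assuming a hypothetical sentences.jsonl output path:

import json

sentences_per_doc = {}
with open('sentences.jsonl') as f:  # hypothetical output file of the command above
    for line in f:
        if line.strip():
            sentences_per_doc.update(json.loads(line))  # {document index: list of sentences}

print('{} documents with at least one sentence'.format(len(sentences_per_doc)))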