# Stdlib imports used by this snippet; PunktSentenceSplitter, TTPosTagger,
# StanfordParser, load_corpus, parallel and logger come from the project's
# own modules (imports omitted in the original listing).
import json
from collections import defaultdict, OrderedDict
from itertools import imap


def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set, json.load(verbs).values()),
                       set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
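A minimal standalone sketch of the set-union idiom above, merging every token list of a lemma-to-tokens JSON mapping into one verb set; the filename and sample data are illustrative, and the code is Python 2 to match the example:

import json
from itertools import imap

with open('verbs.json') as f:          # hypothetical input file
    lemma_to_tokens = json.load(f)     # e.g. {"play": ["play", "played"], ...}

# Fold the per-lemma token sets into a single set, as in main() above
all_verbs = reduce(lambda x, y: x.union(y),
                   imap(set, lemma_to_tokens.values()),
                   set())
all_verbs.discard('be')    # auxiliaries are dropped as too generic
all_verbs.discard('have')

An equivalent one-liner is set().union(*lemma_to_tokens.values()).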
Example #2
    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)

        self.all_verb_tokens = set()
        self.token_to_lemma = {}
        for lemma, match_tokens in self.lemma_to_token.iteritems():
            for match_token in match_tokens:
                self.all_verb_tokens.add(match_token.lower())
                self.token_to_lemma[match_token.lower()] = lemma
        logger.debug("All match tokens: %s" % self.all_verb_tokens)
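The inversion pattern in isolation: building a case-insensitive token-to-lemma lookup from a lemma-to-tokens mapping (the sample data is made up):

# Sketch of the token -> lemma inversion used in setup_extractor above
lemma_to_token = {'play': ['Play', 'played', 'playing'],
                  'win': ['wins', 'Won']}

all_verb_tokens = set()
token_to_lemma = {}
for lemma, match_tokens in lemma_to_token.iteritems():
    for match_token in match_tokens:
        all_verb_tokens.add(match_token.lower())     # normalize case once
        token_to_lemma[match_token.lower()] = lemma  # reverse lookup

assert token_to_lemma['won'] == 'win'

Lowercasing at build time means incoming tokens need only a single .lower() before lookup.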
Example #3
    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        self.parser = StanfordParser(
            path_to_jar='dev/stanford-corenlp-3.6.0.jar',
            path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
            java_options=' -mx2G -Djava.ext.dirs=dev/')

        self.token_to_lemma = {}
        for lemma, tokens in self.lemma_to_token.iteritems():
            for t in tokens:
                self.token_to_lemma[t] = lemma
        self.all_verbs = set(self.token_to_lemma.keys())
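The constructor arguments match NLTK's StanfordParser wrapper; a hedged usage sketch, assuming that wrapper and the same CoreNLP 3.6.0 jars under dev/ as above:

from nltk.parse.stanford import StanfordParser

parser = StanfordParser(
    path_to_jar='dev/stanford-corenlp-3.6.0.jar',
    path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
    java_options=' -mx2G -Djava.ext.dirs=dev/')

# raw_parse yields constituency trees for one raw sentence
for tree in parser.raw_parse('She founded the company in 1999.'):
    tree.pprint()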
Example #4
    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        grammar = self.grammars.get(self.language)
        if grammar:
            self.parser = RegexpParser(grammar)
        else:
            raise ValueError(
                "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s"
                % (self.language, self.grammars.keys()))

        for lemma, match_tokens in self.lemma_to_token.iteritems():
            self.lemma_to_token[lemma] = set(
                [match.lower() for match in match_tokens])
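RegexpParser is NLTK's regular-expression chunker; a minimal sketch with a toy grammar (not the project's per-language grammars) applied to pre-tagged tokens:

from nltk.chunk import RegexpParser

grammar = r'''
VP: {<VB.*><RB>?}   # a verb, optionally followed by an adverb
'''
chunker = RegexpParser(grammar)

tagged = [('He', 'PRP'), ('ran', 'VBD'), ('quickly', 'RB')]
print chunker.parse(tagged)   # (S He/PRP (VP ran/VBD quickly/RB))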
Example #5
    def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)