Code example #1
File: entity_linking.py | Project: rpatil524/StrepHit
def main(sentences, language, outfile, confidence, processes):
    """ Perform entity linking over a set of input sentences.
        The service used is the Dandelion Entity Extraction API:
        https://dandelion.eu/docs/api/datatxt/nex/v1/ .
        Links whose confidence score is below the given
        threshold are discarded.
    """
    def worker(row):
        sentence = json.loads(row)
        text = sentence.get('text')
        if text:
            sentence['linked_entities'] = link(text, confidence, language)
            return json.dumps(sentence)

    count = 0
    for each in parallel.map(worker, sentences, processes):
        outfile.write(each)
        outfile.write('\n')

        count += 1
        if count % 1000 == 0:
            logger.info('Linked %d sentences', count)
    if count > 0:
        logger.info("Dumped linked sentences to '%s'", outfile.name)
    logger.info('Done, linked %d sentences', count)
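
Every snippet on this page pushes its work through the project's parallel.map helper (imported as parallel in these examples). As a rough mental model only, the behaviour the snippets rely on can be sketched as below: None results are dropped, processes=0 falls back to a serial loop, and flatten=True chains per-item iterables. This is a simplified illustration inferred from the calls shown on this page, not the project's actual implementation, and the real function also accepts batch_size and raise_exc arguments that are omitted here.

from multiprocessing import Pool

def map_sketch(function, iterable, processes=0, flatten=False):
    # Illustrative sketch only, not the project's parallel.map itself.
    if processes == 0:
        results = (function(each) for each in iterable)   # serial fallback
    else:
        pool = Pool(processes)
        results = pool.imap_unordered(function, iterable)
    for result in results:
        if result is None:          # skip items the worker chose to drop
            continue
        if flatten:
            for each in result:     # flatten=True: workers return iterables
                yield each
        else:
            yield result
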
Code example #2
File: classify.py | Project: nooralahzadeh/StrepHit
def main(sentences, model, language, outfile, processes, gazetteer):
    gazetteer = reverse_gazetteer(json.load(gazetteer)) if gazetteer else {}

    logger.info("Loading model from '%s' ...", model)
    model, extractor = joblib.load(model)

    classifier = SentenceClassifier(model, extractor, language, gazetteer)

    def worker(batch):
        data = (json.loads(s) for s in batch)
        for classified in classifier.classify_sentences(data):
            yield json.dumps(classified)

    count = 0
    for each in parallel.map(worker, sentences, batch_size=1000,
                             flatten=True, processes=processes):
        outfile.write(each)
        outfile.write('\n')

        count += 1
        if count % 1000 == 0:
            logger.info('Classified %d sentences', count)

    logger.info('Done, classified %d sentences', count)
    if count > 0:
        logger.info("Dumped classified sentences to '%s'", outfile.name)
Code example #3
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter('en')
    tagger = TTPosTagger('en')
    parser = StanfordParser(
        path_to_jar='dev/stanford-corenlp-3.6.0.jar',
        path_to_models_jar='dev/stanford-corenlp-3.6.0-models.jar',
        java_options=' -mx1G -Djava.ext.dirs=dev/'
    )  # no way to make classpath work
    all_verbs = reduce(lambda x, y: x.union(y),
                       imap(set,
                            json.load(verbs).values()), set())
    all_verbs.discard('be')
    all_verbs.discard('have')

    args = load_corpus(corpus, 'bio', text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info('Processed %d documents', i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
Code example #4
    def process_corpus(self, items, output_file, dump_unresolved_file=None, genealogics=None, processes=0):
        count = skipped = 0

        genealogics_url_to_id = {}
        for success, item in parallel.map(self.serialize_item, items, processes, flatten=True):
            if success:
                subj, prop, val, url = item
                statement = wikidata.finalize_statement(
                    subj, prop, val, self.language, url,
                    resolve_property=False, resolve_value=False
                )

                if not statement:
                    continue

                output_file.write(statement.encode('utf8'))
                output_file.write('\n')

                if genealogics and url.startswith('http://www.genealogics.org/'):
                    genealogics_url_to_id[url] = subj

                count += 1
                if count % 10000 == 0:
                    logger.info('Produced %d statements so far, skipped %d names', count, skipped)
            else:
                skipped += 1
                if dump_unresolved_file:
                    dump_unresolved_file.write(json.dumps(item))
                    dump_unresolved_file.write('\n')

        logger.info('Produced %d statements so far, skipped %d names', count, skipped)
        return genealogics_url_to_id, count, skipped
Code example #5
File: entity_linking.py | Project: Wikidata/StrepHit
def main(sentences, language, outfile, confidence, processes):
    """ Perform entity linking over a set of input sentences.
        The service used is the Dandelion Entity Extraction API:
        https://dandelion.eu/docs/api/datatxt/nex/v1/ .
        Links whose confidence score is below the given
        threshold are discarded.
    """

    def worker(row):
        sentence = json.loads(row)
        text = sentence.get('text')
        if text:
            sentence['linked_entities'] = link(text, confidence, language)
            return json.dumps(sentence)

    count = 0
    for each in parallel.map(worker, sentences, processes):
        outfile.write(each)
        outfile.write('\n')

        count += 1
        if count % 1000 == 0:
            logger.info('Linked %d sentences', count)
    if count > 0:
        logger.info("Dumped linked sentences to '%s'", outfile.name)
    logger.info('Done, linked %d sentences', count)
Code example #6
    def label_sentences(self, sentences, normalize_numerical, score_type, core_weight,
                        processes=0, input_encoded=False, output_encoded=False):
        """ Process all the given sentences with the rule-based classifier,
            optionally giving a confidence score

            :param sentences: List of sentence data
            :param normalize_numerical: Whether to automatically
             normalize numerical expressions
            :param score_type: Which type of score (if any) to use to
             compute the classification confidence
            :param core_weight: Weight of the core FEs (used in the scoring)
            :param processes: how many processes to use to concurrently label sentences
            :param input_encoded: whether the corpus is an iterable of dictionaries or an
             iterable of JSON-encoded documents. JSON-encoded documents are preferable
             over large size dictionaries for performance reasons
            :param output_encoded: whether to return a generator of dictionaries or a generator
             of JSON-encoded documents. Prefer encoded output for performance reasons
            :return: Generator of labeled sentences
        """

        def worker(item):
            if input_encoded:
                item = json.loads(item)

            labeled = self.label_sentence(item, normalize_numerical,
                                          score_type, core_weight)

            if labeled:
                return json.dumps(labeled) if output_encoded else labeled

        for each in parallel.map(worker, sentences, processes):
            yield each
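
Because label_sentences returns a generator, labeled sentences only materialise as the caller iterates over them. A hypothetical call is sketched below; the classifier object, the input file name and the parameter values are placeholders chosen for illustration, not values taken from the project.

# Hypothetical usage of label_sentences; 'classifier' stands for an already
# built rule-based classifier, and the file name and parameters are made up.
with open('sentences.jsonl') as f:
    labeled = classifier.label_sentences(f, normalize_numerical=True,
                                         score_type=None, core_weight=2,
                                         processes=4, input_encoded=True,
                                         output_encoded=True)
    for row in labeled:    # each row is a JSON-encoded labeled sentence
        print row
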
Code example #7
def lu_count(sentences, processes=0, input_encoded=False):
    """ Count how many sentences per LU there are for each source

        :param iterable sentences: Corpus with the POS-tagged sentences
        :param int processes: how many processes to use for parallel execution
        :param bool input_encoded: whether the corpus is an iterable of dictionaries
         or an iterable of JSON-encoded documents. JSON-encoded
         documents are preferable over large size dictionaries for performance reasons
        :return: A dictionary source -> frequencies, where frequencies is
         another dictionary lemma -> count
        :rtype: dict
    """
    def worker(batch):
        freqs = defaultdict(lambda: 0)
        for row in batch:
            sentence = json.loads(row) if input_encoded else row

            parsed = urlparse(sentence['url'])
            if not parsed.netloc:
                logger.warn('cannot parse URL: %s', sentence['url'])
                return

            lu = sentence['lu']
            freqs[(parsed.netloc, lu)] += 1

        return freqs.items()

    frequencies = defaultdict(lambda: defaultdict(lambda: 0))
    for (source, lemma), count in parallel.map(worker,
                                               sentences,
                                               processes,
                                               batch_size=100,
                                               flatten=True):
        frequencies[source][lemma] += count
    return frequencies
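
The returned structure is a two-level mapping, so counts can be read per source and then per lemma. A hypothetical call is sketched below; the corpus file name is a placeholder.

# Hypothetical usage of lu_count; the corpus file name is made up.
with open('pos_tagged_sentences.jsonl') as f:
    frequencies = lu_count(f, processes=4, input_encoded=True)

for source, per_lemma in frequencies.items():
    for lemma, count in per_lemma.items():
        print source, lemma, count
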
Code example #8
def main(corpus, verbs, processes, outfile, sub_sentences):
    """ Compute the LU distribution in the corpus, i.e. how many LUs per sentence
    """
    global splitter, tagger, parser, all_verbs
    splitter = PunktSentenceSplitter("en")
    tagger = TTPosTagger("en")
    parser = StanfordParser(
        path_to_jar="dev/stanford-corenlp-3.6.0.jar",
        path_to_models_jar="dev/stanford-corenlp-3.6.0-models.jar",
        java_options=" -mx1G -Djava.ext.dirs=dev/",
    )  # no way to make classpath work
    all_verbs = reduce(lambda x, y: x.union(y), imap(set, json.load(verbs).values()), set())
    all_verbs.discard("be")
    all_verbs.discard("have")

    args = load_corpus(corpus, "bio", text_only=True)
    worker = worker_with_sub_sentences if sub_sentences else worker_with_sentences
    counter = defaultdict(int)

    for i, counts in enumerate(parallel.map(worker, args, processes)):
        for k, v in counts.iteritems():
            counter[k] += v

        if (i + 1) % 10000 == 0:
            logger.info("Processed %d documents", i + 1)

    counter = OrderedDict(sorted(counter.items(), key=lambda (k, v): k))
    for k, v in counter.iteritems():
        print k, v

    json.dump(counter, outfile, indent=2)
Code example #9
File: test_commons.py | Project: rpatil524/StrepHit
    def test_batches(self):
        def consumer(bulk):
            self.assertEqual(len(bulk), batch_size)
            return True

        for batch_size in range(1, 10, 2):
            data = range(batch_size * 5)
            self.assertTrue(all(parallel.map(consumer, data, processes=5, batch_size=batch_size)))
Code example #10
def about_sources(corpus, processes, with_bio):
    """ Items' sources
    """
    def worker(items):
        sources = defaultdict(int)
        for doc in items:
            url = doc.get('url')
            if not url:
                logger.warn('found an item without URL, name: %s, bio: %s',
                            doc.get('name'),
                            doc.get('bio', '')[:100] + ' ...')
                sources['_skipped_'] += 1
                continue
            elif with_bio and len(doc.get('bio') or '') < 5:
                continue

            parsed = urlparse(url)
            if parsed.netloc:
                sources[parsed.netloc] += 1
            else:
                logger.warn('cannot parse URL: %s', url)
                sources['_skipped_'] += 1
        return sources

    aggregated_sources = defaultdict(int)
    corpus = parallel.make_batches(load_scraped_items(corpus), 1000)
    for sources in parallel.map(worker, corpus, processes):
        for k, v in sources.iteritems():
            aggregated_sources[k] += v

    aggregated_sources = sorted(aggregated_sources.items(),
                                key=lambda (_, v): v,
                                reverse=True)
    for source, count in aggregated_sources:
        print source, count

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        logger.warn('Cannot import matplotlib, skipping chart')
        return

    count = sum(c for s, c in aggregated_sources)
    display_sources = filter(lambda (s, v): float(v) / count >= 0.01,
                             aggregated_sources)
    sources, values = map(list, zip(*display_sources))
    sources.append('Rest')
    values.append(count - sum(values))
    plt.pie(values, labels=sources)
    plt.axis('equal')
    plt.show()
Code example #11
def extract_sentences(sentences,
                      probabilities,
                      processes=0,
                      input_encoded=False,
                      output_encoded=False):
    """ Extracts some sentences from the corpus following the given probabilities

        :param iterable sentences: Extracted sentences
        :param dict probabilities: Conditional probabilities of extracting a sentence containing
         a specific LU given the source of the sentence. It is a flat mapping
         (source, LU) -> probability, which is how the worker below looks it up
        :param int processes: how many processes to use for parallel execution
        :param bool input_encoded: whether the corpus is an iterable of dictionaries or an
         iterable of JSON-encoded documents. JSON-encoded documents are preferable
         over large size dictionaries for performance reasons
        :param bool output_encoded: whether to return a generator of dictionaries or a generator
         of JSON-encoded documents. Prefer encoded output for performance reasons
        :return: Generator of sentences
    """
    def worker(batch):
        for row in batch:
            sentence = json.loads(row) if input_encoded else row
            parsed = urlparse(sentence['url'])
            if not parsed.netloc:
                logger.warn('cannot parse URL: %s', sentence['url'])
                return

            lu = sentence['lu']
            p = probabilities[(parsed.netloc, lu)]

            if random.random() < p:
                yield parsed.netloc, lu, json.dumps(
                    sentence) if output_encoded else sentence

    counts = defaultdict(lambda: 0)
    for source, lu, sentence in parallel.map(worker,
                                             sentences,
                                             processes,
                                             batch_size=100,
                                             flatten=True):
        counts[(source, lu)] += 1
        yield sentence

    aggs_lu = defaultdict(lambda: 0)
    aggs_source = defaultdict(lambda: 0)
    for (source, lu), n in counts.iteritems():
        aggs_lu[lu] += n
        aggs_source[source] += n

    logger.debug('aggregated statistics per LU: %s', aggs_lu)
    logger.debug('aggregated statistics per source: %s', aggs_source)
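
One plausible way to wire this up is to derive the probabilities from the lu_count output shown in code example #7; the sketch below is purely illustrative, with made-up file names and a flat 0.5 sampling probability rather than anything computed by the project.

# Hypothetical wiring of lu_count and extract_sentences; file names and the
# uniform 0.5 probability are placeholders.
with open('pos_tagged_sentences.jsonl') as f:
    frequencies = lu_count(f, input_encoded=True)

probabilities = {}
for source, per_lemma in frequencies.items():
    for lemma, count in per_lemma.items():
        probabilities[(source, lemma)] = 0.5   # flat (source, LU) -> probability

with open('pos_tagged_sentences.jsonl') as f:
    for sentence in extract_sentences(f, probabilities, processes=4,
                                      input_encoded=True):
        pass   # sampled sentence dicts arrive here as they are extracted
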
Code example #12
    def find_ranking(self, processes=0):
        """ Ranks the verbs

            :param int processes: How many processes to use for parallel ranking
            :return: tuple with average tf-idf and average standard deviation ordered rankings
            :rtype: tuple of (OrderedDict, OrderedDict)
        """
        tfidf_ranking = {}
        stdev_ranking = {}
        for lemma, tfidf, stdev in parallel.map(self.score_lemma, self.verbs, processes):
            tfidf_ranking[lemma] = tfidf
            stdev_ranking[lemma] = stdev
        return (OrderedDict(sorted(tfidf_ranking.items(), key=lambda x: x[1], reverse=True)),
                OrderedDict(sorted(stdev_ranking.items(), key=lambda x: x[1], reverse=True)))
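
Both returned rankings are already sorted in descending order, so the highest-scoring verbs can be read straight off. A hypothetical read of the result follows; 'ranker' stands for whatever object exposes find_ranking and the top-10 slice is arbitrary.

# Hypothetical read of find_ranking's output.
tfidf_ranking, stdev_ranking = ranker.find_ranking(processes=4)
for lemma in list(tfidf_ranking)[:10]:
    print lemma, tfidf_ranking[lemma], stdev_ranking.get(lemma)
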
Code example #13
File: split_sentences.py | Project: Wikidata/StrepHit
def main(corpus, document_key, language_code, outfile, processes):
    """ Split an input corpus into sentences """
    corpus = load_corpus(corpus, document_key, text_only=True)
    s = PunktSentenceSplitter(language_code)

    logger.info("Starting sentence splitting of the input corpus ...")

    def worker((i, text)):
        sentences = list(s.split(text))
        return json.dumps({i: sentences}) if sentences else None

    for sentences in parallel.map(worker, enumerate(corpus), processes):
        outfile.write(sentences)
        outfile.write('\n')

    return 0
Code example #14
def main(corpus, document_key, language_code, outfile, processes):
    """ Split an input corpus into sentences """
    corpus = load_corpus(corpus, document_key, text_only=True)
    s = PunktSentenceSplitter(language_code)

    logger.info("Starting sentence splitting of the input corpus ...")

    def worker((i, text)):
        sentences = list(s.split(text))
        return json.dumps({i: sentences}) if sentences else None

    for sentences in parallel.map(worker, enumerate(corpus), processes):
        outfile.write(sentences)
        outfile.write('\n')

    return 0
Code example #15
    def find_ranking(self, processes=0, bulk_size=10000, normalize=True):
        ranking = defaultdict(int)
        for score in parallel.map(self.score_from_tokens,
                                  self._bulkenize(self.tags, bulk_size),
                                  processes):

            for k, v in score.iteritems():
                ranking[k] += v

        ranking = OrderedDict(sorted(ranking.items(), key=lambda x: x[1], reverse=True))

        if normalize:
            max_score = float(ranking[next(iter(ranking))])
            for lemma, score in ranking.iteritems():
                ranking[lemma] = score / max_score

        return ranking
Code example #16
File: serialize.py | Project: nooralahzadeh/StrepHit
def main(classified, lexical_db, outfile, language, semistructured, processes,
         dump_unresolved):
    """ Serialize classification results into quickstatements
    """

    if semistructured:
        url_to_wid = map_url_to_wid(semistructured)
        logger.info(
            'Used semi-structured dataset to infer %d Wikidata Item IDs',
            len(url_to_wid))
    else:
        url_to_wid = {}
        logger.info('TIP: using the semi-structured dataset could help in '
                    'resolving the Wikidata Item ID of more subjects')

    lexical_db = json.load(lexical_db)

    count = skipped = 0
    serializer = ClassificationSerializer(language, lexical_db, url_to_wid)
    for success, item in parallel.map(serializer.to_statements,
                                      classified,
                                      processes=processes,
                                      flatten=True):
        if success:
            outfile.write(item.encode('utf8'))
            outfile.write('\n')

            count += 1
        else:
            skipped += 1
            if dump_unresolved:
                dump_unresolved.write(json.dumps(item))
                dump_unresolved.write('\n')

        if count % 1000 == 0 and count > 0:
            logger.info('Produced %d statements so far, skipped %d names',
                        count, skipped)

    logger.info('Done, produced %d statements, skipped %d names', count,
                skipped)
    logger.info("Dataset serialized to '%s'", outfile.name)
    if dump_unresolved:
        logger.info("Unresolved entities dumped to '%s'", dump_unresolved.name)
Code example #17
def main(classified, lexical_db, outfile, language,
         semistructured, processes, dump_unresolved):
    """ Serialize classification results into quickstatements
    """

    if semistructured:
        url_to_wid = map_url_to_wid(semistructured)
        logger.info('Used semi-structured dataset to infer %d Wikidata Item IDs',
                    len(url_to_wid))
    else:
        url_to_wid = {}
        logger.info('TIP: using the semi-structured dataset could help in '
                    'resolving the Wikidata Item ID of more subjects')

    lexical_db = json.load(lexical_db)

    count = skipped = 0
    serializer = ClassificationSerializer(language, lexical_db, url_to_wid)
    for success, item in parallel.map(serializer.to_statements, classified,
                                       processes=processes, flatten=True):
        if success:
            outfile.write(item.encode('utf8'))
            outfile.write('\n')

            count += 1
        else:
            skipped += 1
            if dump_unresolved:
                dump_unresolved.write(json.dumps(item))
                dump_unresolved.write('\n')

        if count % 1000 == 0 and count > 0:
            logger.info('Produced %d statements so far, skipped %d names', count, skipped)

    logger.info('Done, produced %d statements, skipped %d names', count, skipped)
    logger.info("Dataset serialized to '%s'", outfile.name)
    if dump_unresolved:
        logger.info("Unresolved entities dumped to '%s'", dump_unresolved.name)
Code example #18
    def extract(self, processes=0):
        """ Processes the corpus extracting sentences from each item
            and storing them in the item itself.

            :param int processes: how many processes to use for parallel tagging
            :return: the extracted sentences
            :rtype: generator of dicts
        """
        self.setup_extractor()

        try:
            count = 0
            for i, (item, extracted) in enumerate(
                    parallel.map(self.extract_from_item, self.corpus,
                                 processes)):

                if not item.get('name') or not item.get('url'):
                    logger.warn('Skipping item without name or URL')
                    continue

                # assign a unique incremental ID to each sentence
                # and store information about the originating document
                for each in extracted:
                    each['id'] = count
                    each['url'] = item['url']
                    each['name'] = item['name']
                    count += 1

                    yield each

                if (i + 1) % 10000 == 0:
                    logger.info('Processed %d items, extracted %d sentences',
                                i + 1, count)

            logger.info('Done, total sentences extracted: %d', count)
        finally:
            self.teardown_extractor()
Code example #19
    def extract(self, processes=0):
        """ Processes the corpus extracting sentences from each item
            and storing them in the item itself.

            :param int processes: how many processes to use for parallel tagging
            :return: the extracted sentences
            :rtype: generator of dicts
        """
        self.setup_extractor()

        try:
            count = 0
            for i, (item, extracted) in enumerate(parallel.map(self.extract_from_item,
                                                               self.corpus, processes)):

                if not item.get('name') or not item.get('url'):
                    logger.warn('Skipping item without name or URL')
                    continue

                # assign a unique incremental ID to each sentence
                # and store information about the originating document
                for each in extracted:
                    each['id'] = count
                    each['url'] = item['url']
                    each['name'] = item['name']
                    count += 1

                    yield each

                if (i + 1) % 10000 == 0:
                    logger.info('Processed %d items, extracted %d sentences',
                                i + 1, count)

            logger.info('Done, total sentences extracted: %d', count)
        finally:
            self.teardown_extractor()
Code example #20
File: test_commons.py | Project: rpatil524/StrepHit
    def test_exception_single(self):
        self.assertRaises(ValueError, self.consume,
                          parallel.map(self.exc_function, self.list_in, processes=1,
                                       raise_exc=True))
Code example #21
File: test_commons.py | Project: rpatil524/StrepHit
    def test_flatten_multi_process(self):
        list_out = parallel.map(self.multi_function, self.list_in, processes=2,
                                flatten=True)
        self.assertEqual(Counter(list_out), Counter(self.correct_multi))
Code example #22
File: test_commons.py | Project: rpatil524/StrepHit
    def test_more_workers(self):
        list_out = set(parallel.map(self.function, self.list_in, processes=20))
        self.assertEqual(list_out, self.correct)
Code example #23
File: test_commons.py | Project: rpatil524/StrepHit
    def test_with_nones_multi_process(self):
        list_out = set(parallel.map(self.function, self.list_in_nones, processes=1))
        self.assertEqual(list_out, self.correct_nones)
Code example #24
File: test_commons.py | Project: rpatil524/StrepHit
    def test_multi_process(self):
        list_out = set(parallel.map(self.function, self.list_in, processes=2))
        self.assertEqual(list_out, self.correct)