Example No. 1
def create_cluster_obj(cluster_id, cluster_type, mentions, default_label,
                       default_facet):
    nlp = get_item("spacy")

    ents_counter = Counter()
    pos_counter = Counter()
    labels_to_mentions = defaultdict(list)
    pos_to_mentions = defaultdict(list)
    unique_sents_ids = set()
    for mention in mentions:
        clean_mention = clean_text(mention.token)

        doc = nlp(mention.token)
        label = "NO_LABEL"
        pos = "NO_LABEL"
        for ent in doc.ents:
            # Only if the whole string is an entity
            if clean_text(ent.text) == clean_mention:
                label = ent.label_

        for token in doc:
            if clean_text(token.text) == clean_mention:
                pos = token.pos_

        ents_counter[label] += 1
        pos_counter[pos] += 1
        labels_to_mentions[label].append(mention)
        pos_to_mentions[pos].append(mention)
        unique_sents_ids.add(f"{mention.doc_id} {mention.sent_idx}")

    most_representative_mention, ner_label = _choose_most_representative_mention(
        mentions, ents_counter, labels_to_mentions)

    cluster_label = LABELS_MAP.get(ner_label, default_label)
    cluster_facet = FACETS_MAP.get(ner_label, default_facet)

    pos_label = None
    if pos_counter:
        pos_label = pos_counter.most_common(1)[0][0]

    return Cluster(cluster_id, cluster_type, mentions, pos_label,
                   cluster_label, cluster_facet, most_representative_mention,
                   len(mentions), len(unique_sents_ids))
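The whole-string check above assigns an entity label only when the entire mention is recognized as a single entity, and a POS tag only when the mention is a single token. Below is a minimal standalone sketch of that pattern, assuming a standard spaCy model such as en_core_web_sm and using a trivial stand-in for the project's clean_text helper (the stand-in is an assumption, not the original implementation):

import spacy

def clean_text(text):
    # hypothetical stand-in for the project's clean_text helper
    return text.strip().lower()

nlp = spacy.load("en_core_web_sm")  # assumed model; the project obtains its nlp via get_item("spacy")

mention_text = "Barack Obama"
clean_mention = clean_text(mention_text)
doc = nlp(mention_text)

label, pos = "NO_LABEL", "NO_LABEL"
for ent in doc.ents:
    # label only when the whole mention string is the entity
    if clean_text(ent.text) == clean_mention:
        label = ent.label_
for token in doc:
    # a POS tag is assigned only when the mention is a single token
    if clean_text(token.text) == clean_mention:
        pos = token.pos_

print(label, pos)  # a multi-token mention typically gets an entity label but keeps pos == "NO_LABEL"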
Example No. 2
    def _initSummarySpacyObject(self):
        nlp = get_item("spacy")
        # get the top summary sentences per document (to cut time significantly for the full corpus processing):
        perDocSummTexts = []
        for doc in self.corpus.documents:
            # concatenate the document's top sentences, ensuring each one ends with a period:
            docSumm = ''
            for sentText in doc.topSentencesText:
                sentText = sentText.strip()
                if not sentText:
                    continue
                if sentText.endswith('.'):
                    docSumm += sentText + ' '
                else:
                    docSumm += sentText + '. '
            perDocSummTexts.append(docSumm)

        # create a SpaCy object for the concatenated summaries of all the documents:
        return nlp(' '.join(perDocSummTexts))
Example No. 3
    def _initDoc(self):
        nlp = get_item("spacy")
        self.spacyDoc = nlp(self.text)
        self.tokens = [t.text for t in self.spacyDoc]
        self.topSentencesText = [
            sent.text for sent in self.spacyDoc._.textrank.summary(
                limit_phrases=20, limit_sentences=NUMBER_OF_TOP_SENTENCES_KEPT)
        ]

        # sentence tokenization done with SpaCy - for consistency within all variants

        if self.representationStyle == REPRESENTATION_STYLE_SPACY:
            # computing a SpaCy object per sentence is time-consuming, so the precomputed
            # sentence vector is passed in directly instead of being recomputed:
            self.sentences = []
            for sentIdx, sentSpacyObj in enumerate(self.spacyDoc.sents):
                sentence = Sentence(self.id,
                                    sentIdx,
                                    sentSpacyObj.text,
                                    self.representationStyle,
                                    doNotInitRepresentation=True,
                                    spacy_rep=sentSpacyObj)
                sentence.setRepresentation(sentSpacyObj.vector)
                self.sentences.append(sentence)

        # in all other cases the representations are computed within the Sentence object itself:
        else:
            self.sentences = [
                Sentence(self.id,
                         sentIdx,
                         sentSpacyObj.text,
                         self.representationStyle,
                         spacy_rep=sentSpacyObj)
                for sentIdx, sentSpacyObj in enumerate(self.spacyDoc.sents)
            ]
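The ._.textrank extension used above is not part of core spaCy; it is provided by the pytextrank pipeline component, which must be added to the shared nlp object before this code runs. A rough sketch of that setup, assuming pytextrank 3.x and the en_core_web_sm model (both assumptions; the project configures its pipeline inside get_item("spacy")):

import spacy
import pytextrank  # registers the "textrank" pipeline factory

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

doc = nlp("First sentence of a document. A second, more detailed sentence. A third one to summarize.")
top_sentences = [
    sent.text
    for sent in doc._.textrank.summary(limit_phrases=20, limit_sentences=3)
]
print(top_sentences)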
Example No. 4
    def __initRepresentation(self):
        nlp = get_item("spacy")
        text = self.text
        # drop stop words and punctuation before building the representation:
        text = "".join([
            x.text_with_ws for x in nlp(text)
            if x.text not in STOP_WORDS and x.text.lower() not in PUNCTUATION
        ])
        if self.representationStyle == REPRESENTATION_STYLE_SPACY:
            self.representation = nlp(text).vector  # the document vector (a numpy array)
        elif self.representationStyle == REPRESENTATION_STYLE_BERT:
            self.representation = bert_embedder.encode([text])[0]  # a numpy vector
        elif self.representationStyle == REPRESENTATION_STYLE_W2V:  # default for now is W2V
            # average the word vectors of the remaining tokens:
            wordVectors = [
                nlp.vocab.get_vector(w) for w in self.tokens
                if w not in STOP_WORDS and w not in PUNCTUATION
                and nlp.vocab.has_vector(w)
            ]
            if len(wordVectors) > 0:
                self.representation = np.mean(wordVectors, axis=0)
            else:
                # fall back to a random vector when no token has a known word vector:
                self.representation = np.random.uniform(-1, 1, (300, ))
        else:
            self.representation = None
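bert_embedder is defined elsewhere in the project; its encode([text])[0] usage matches the sentence-transformers interface, though that is an assumption here. A hedged sketch comparing such a BERT-style embedding with the W2V-style mean-pooled vector built above (the model names are illustrative choices):

import numpy as np
import spacy
from sentence_transformers import SentenceTransformer  # assumption: bert_embedder is a SentenceTransformer

text = "The summit produced a joint statement on trade."

bert_embedder = SentenceTransformer("all-MiniLM-L6-v2")  # hypothetical model choice
bert_vec = bert_embedder.encode([text])[0]               # one numpy vector per input text

nlp = spacy.load("en_core_web_md")  # md/lg models ship word vectors; sm does not
tokens = [t.text for t in nlp(text)]
word_vectors = [nlp.vocab.get_vector(w) for w in tokens if nlp.vocab.has_vector(w)]
w2v_vec = (np.mean(word_vectors, axis=0) if word_vectors
           else np.random.uniform(-1, 1, (300,)))  # random fallback mirrors the code above

print(bert_vec.shape, w2v_vec.shape)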
Example No. 5
import os

from dataclasses import dataclass
from typing import Optional, Set, Dict, List

from QFSE.Corpus import Corpus
import data.Config as config
from QFSE.Utilities import REPRESENTATION_STYLE_SPACY, REPRESENTATION_STYLE_BERT, get_item, loadBert
from QFSE.coref.models import Mention
from QFSE.coref.utils import convert_corpus_to_coref_input_format, get_coref_clusters
from QFSE.models import DocSent, Cluster, ClusterUserWrapper
from QFSE.propositions.utils import get_proposition_clusters
from QFSE.consts import COREF_TYPE_EVENTS, COREF_TYPE_PROPOSITIONS, COREF_TYPE_ENTITIES

# The SpaCy and BERT objects must be loaded before anything else, so that classes using them
# get the initialized objects. They are loaded lazily, only when needed, since initialization takes a long time.
REPRESENTATION_STYLE = REPRESENTATION_STYLE_SPACY  # REPRESENTATION_STYLE_W2V REPRESENTATION_STYLE_BERT
get_item("spacy")
if REPRESENTATION_STYLE == REPRESENTATION_STYLE_BERT:
    loadBert()


class CorpusRegistry:
    def __init__(self):
        self._registry = {}

    def get_corpus(self, topicId) -> Optional[Corpus]:
        if topicId not in self._registry:
            # make sure the topic ID is valid:
            if topicId in config.CORPORA_LOCATIONS:
                referenceSummsFolder = os.path.join(
                    config.CORPORA_LOCATIONS[topicId],
                    config.CORPUS_REFSUMMS_RELATIVE_PATH)
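get_item comes from QFSE.Utilities and its implementation is not shown in these excerpts; the lazy-initialization behavior described in the comments above could look roughly like the following sketch (the registry, loader names, and model choice are hypothetical):

# Hypothetical sketch of a lazy-loading registry; the real get_item lives in QFSE.Utilities.
_ITEMS = {}


def _load_spacy():
    import spacy
    import pytextrank
    nlp = spacy.load("en_core_web_md")  # assumed model
    nlp.add_pipe("textrank")            # needed for the ._.textrank calls used elsewhere
    return nlp


_LOADERS = {"spacy": _load_spacy}


def get_item(name):
    # heavy objects are created once, on first request, and cached afterwards
    if name not in _ITEMS:
        _ITEMS[name] = _LOADERS[name]()
    return _ITEMS[name]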
Example No. 6
    def getQuerySummaryJson(self, clientJson):
        clientId = clientJson['clientId']
        topicId = clientJson['request_query']['topicId']
        clusters_query = self._get_clusters_query_from_request(
            clientJson['request_query'])
        query = clientJson['request_query']['query']

        if not m_infoManager.clientInitialized(clientId):
            return self.getErrorJson('Unknown client. Please reload page.')

        if topicId != m_infoManager.getTopicId(clientId):
            return self.getErrorJson(
                'Topic ID not yet initialized by client: {}'.format(topicId))

        reply_query = {}

        corpus_registry: CorpusRegistry = get_item("corpus_registry")
        corpus: Corpus = corpus_registry.get_corpus(topicId)
        doc_sent_indices: Optional[Set[DocSent]] = None

        query_result_wrapper = None

        if clusters_query:
            query_registry: QueryRegistry = get_item("query_registry")
            query_result = query_registry.get_query(clusters_query)
            query_results_analyzer = m_infoManager.get_query_results_analyzer(
                clientId)

            if query_result is None:
                sentences = []
                doc_sent_indices = self._clusters_query_to_doc_sent_indices(
                    clusters_query, corpus)
                if any(doc_sent_indices):
                    doc_sent_indices_to_use = set.intersection(
                        *doc_sent_indices)
                    sentences = self._get_sentences_for_query(
                        doc_sent_indices_to_use, corpus)

                query_result = QueryResult(
                    [], clusters_query,
                    [
                        QueryResultSentence(
                            self._split_sent_text_to_tokens(
                                sent, is_original_sentences=True),
                            sent.docId, sent.sentIndex)
                        for sent in sentences
                    ],
                    datetime.utcnow().isoformat())
                if any(sentences):
                    if len(sentences) > 1:
                        summarizer = get_item("bart_summarizer")
                        summary_sents = summarizer.summarize(sentences)
                    else:
                        # No need to summarize one sentence
                        summary_sents = [sent.spacy_rep for sent in sentences]

                    query_result.result_sentences = [
                        QueryResultSentence(
                            self._split_sent_text_to_tokens(
                                sent,
                                is_original_sentences=False,
                                original_sentences=sentences))
                        for sent in summary_sents
                    ]

                query_registry.save_query(query_result)

            # Save queries and mark similar sentences to those used
            # query_results_analyzer.analyze_repeating(query_result)
            query_idx = query_results_analyzer.add_query_results(query_result)
            query_result_wrapper = QueryResultUserWrapper(
                query_result, query_idx)

            reply_query = {
                "queryResult": query_result_wrapper.custom_to_dict(),
                "textLength": 0,
            }

            doc_sent_indices = query_result.get_doc_sent_indices()

        query_idx = query_result_wrapper.query_idx if query_result_wrapper is not None else None
        m_infoManager.add_ui_action_log(
            clientId,
            UIAction("query", {"query_idx": query_idx},
                     datetime.utcnow().isoformat()))

        # Always return the clusters, even if the query is None
        reply_query = {
            **reply_query,
            "corefClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_ENTITIES], doc_sent_indices),
            "eventsClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_EVENTS], doc_sent_indices),
            "propositionClustersMetas": get_clusters_filtered(
                corpus.coref_clusters[COREF_TYPE_PROPOSITIONS], doc_sent_indices),
        }

        return json.dumps({"reply_query": reply_query})
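The variable doc_sent_indices built inside the query branch above appears to hold one set of document-sentence indices per queried cluster, and the intersection keeps only the sentences covered by every cluster in the query. A small illustration of that step, using plain tuples where the project uses its DocSent model (the data here is hypothetical):

# Hypothetical data illustrating the intersection step in the query handler above.
cluster_a = {("doc1", 0), ("doc1", 3), ("doc2", 1)}
cluster_b = {("doc1", 3), ("doc2", 1), ("doc2", 4)}
doc_sent_indices = [cluster_a, cluster_b]

if any(doc_sent_indices):
    doc_sent_indices_to_use = set.intersection(*doc_sent_indices)
    print(doc_sent_indices_to_use)  # contains ("doc1", 3) and ("doc2", 1)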
Example No. 7
    def getInitialSummaryJson(self, clientJson):
        clientId = clientJson['clientId']
        request = clientJson['request_get_initial_summary']
        topicId = request['topicId']
        questionnaireBatchIndex = request['questionnaireBatchIndex']
        timeAllowed = request['timeAllowed']
        assignmentId = request['assignmentId']
        hitId = request['hitId']
        workerId = request['workerId']
        turkSubmitTo = request['turkSubmitTo']

        corpus_registry: CorpusRegistry = get_item("corpus_registry")
        corpus = corpus_registry.get_corpus(topicId)
        if corpus is None:
            return self.getErrorJson(
                'Topic ID not supported: {}'.format(topicId))

        m_infoManager.initClient(clientId, corpus, None, 0, None, topicId,
                                 questionnaireBatchIndex, timeAllowed,
                                 assignmentId, hitId, workerId, turkSubmitTo,
                                 QueryResultsAnalyzer())
        topicName = topicId

        m_infoManager.add_ui_action_log(
            clientId,
            UIAction("initial", {"topic_id": topicId},
                     datetime.utcnow().isoformat()))

        reply = {
            "reply_get_initial_summary": {
                "summary": [],
                "keyPhraseList": [],
                "topicName": topicName,
                "topicId": topicId,
                "documentsMetas": {
                    x.id: {"id": x.id, "num_sents": len(x.sentences)}
                    for x in corpus.documents
                },
                "corefClustersMetas": get_clusters_filtered(
                    corpus.coref_clusters[COREF_TYPE_ENTITIES]),
                "eventsClustersMetas": get_clusters_filtered(
                    corpus.coref_clusters[COREF_TYPE_EVENTS]),
                "propositionClustersMetas": get_clusters_filtered(
                    corpus.coref_clusters[COREF_TYPE_PROPOSITIONS]),
                "numDocuments": str(len(corpus.documents)),
                "questionnaire": [],
                "timeAllowed": str(timeAllowed),
                "textLength": ""
            }
        }
        return json.dumps(reply)