Example #1

import logging

import nltk
from spacy.symbols import (ADJ, NOUN, PROPN, PUNCT, VERB,
                           agent, aux, auxpass, prep)
from spacy.tokens import Doc, Span

# Project-internal names used below (AbstractNormalizer, AbbreviationsParser,
# SUBJECTS, OBJECTS, ANY_NOUN) are assumed to be importable from the
# surrounding project.


class SentenceAnalysisSpacy(object):
    def __init__(self,
                 sentence,
                 nlp,
                 abbreviations=None,
                 normalize=True,
                 tagger=None,
                 stopwords=set()):
        self.logger = logging.getLogger(__name__)
        self._normalizer = AbstractNormalizer()
        self._abbreviations_finder = AbbreviationsParser()
        self._tagger = tagger
        self.stopwords = stopwords

        self.tags = []
        self.abbreviations = {}
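        # accept an already-parsed spaCy Doc or Span, or a raw unicode sentence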
        if isinstance(sentence, Doc):
            self.sentence = sentence
            self.doc = sentence
            if self._tagger is not None:
                self.tags = self._tagger.tag(self.doc.text)
        elif isinstance(sentence, Span):
            self.sentence = sentence
            self.doc = sentence.doc
            if self._tagger is not None:
                self.tags = self._tagger.tag(self.doc.text)
        elif isinstance(sentence, unicode):
            if not sentence.replace('\n', '').strip():
                raise AttributeError('sentence cannot be empty')
            if normalize:
                sentence = u'' + self._normalizer.normalize(sentence)
            if abbreviations is None:
                self.abbreviations = self._abbreviations_finder.digest_as_dict(
                    sentence)
                # self.logger.info('abbreviations: ' + str(self.abbreviations))

            if abbreviations:
                # when provided, abbreviations is expected to be an iterable
                # of (short form, long form) pairs
                for short_form, long_form in abbreviations:
                    if short_form in sentence and long_form not in sentence:
                        sentence = sentence.replace(short_form, long_form)
            if self._tagger is not None:
                self.tags = self._tagger.tag(sentence)
            self.sentence = nlp(sentence)
            self.doc = self.sentence
        else:
            raise AttributeError(
                'sentence needs to be unicode, Doc or Span, not %s' %
                sentence.__class__)
            # self.logger.debug(u'Sentence to analyse: '+self.sentence.text)

    def isNegated(self, tok):
        negations = {"no", "not", "n't", "never", "none", "false"}
        for dep in list(tok.lefts) + list(tok.rights):
            if dep.lower_ in negations:
                return True

        # alternatively look for
        #     for child in predicate.children:
        #       if child.dep_ == 'neg':
        #            context = 'Negative'
        return False

    def get_alternative_subjects(self, tok):
        '''given a token that is the subject of a verb, extend to other possible subjects in the left part of the
        relation: get objects in the subtree of the subject to be related to objects on the right side of the verb
        #risk'''
        alt_subjects = [tok]
        allowed_pos = [NOUN, PROPN]
        for sibling in tok.head.children:
            # print sibling, sibling.pos_, sibling.dep_
            if sibling.pos in allowed_pos:
                for sub_subj in sibling.subtree:
                    if sub_subj.dep_ in ANY_NOUN and sub_subj.pos in allowed_pos:
                        # should check here that there is an association
                        # between the subj and these objects
                        alt_subjects.append(sub_subj)
                        # alt_subjects.append(parsed[sub_subj.left_edge.i : sub_subj.right_edge.i + 1].text.lower())
                '''get other subjects conjoined to the main one, to be related to objects on the right side of the
                verb #risk'''
                for sub_subj in sibling.conjuncts:
                    # print sub_subj, sub_subj.pos_, sub_subj.dep_
                    if sub_subj.dep_ in SUBJECTS and sub_subj.pos in allowed_pos:
                        # should check here that there is an association
                        # between the subj and these objects
                        alt_subjects.append(sub_subj)
        # print alt_subjects

        # sys.exit()
        return alt_subjects

    def get_extended_verb(self, v):
        '''
        given a verb, build a representative chain of verbs from the syntax tree
        :param v:
        :return:
        '''
        verb_modifiers = [prep, agent]
        verb_path = [v]
        verb_text = v.lemma_.lower()
        allowed_lefts_pos = [NOUN, PROPN]
        lefts = list([
            i for i in v.lefts
            if i.pos in allowed_lefts_pos and i.dep_ in SUBJECTS
        ])
        '''get ancestor verbs, if any'''
        if not v.dep_ == 'ROOT':
            for av in [
                    i for i in v.ancestors
                    if i.pos == VERB and i.dep not in (aux, auxpass)
            ]:
                verb_text = av.lemma_.lower() + ' ' + v.text.lower()
                verb_path.append(av)
                lefts.extend([
                    i for i in av.lefts
                    if i.pos in allowed_lefts_pos and i.dep_ in SUBJECTS
                ])
        for vchild in v.children:
            if vchild.dep in verb_modifiers:
                verb_text += ' ' + vchild.text.lower()
        return verb_text, verb_path, lefts

    def get_verb_path_from_ancestors(self, tok):
        '''
        given a token, return the chain of verbs among its ancestors
        :param tok:
        :return:
        '''
        return [i for i in tok.ancestors if i.pos == VERB and i.dep != aux]

    def get_extended_token(self, tok):
        '''
        given a token, find a more descriptive span by extending it with its children
        :param tok:
        :return:
        '''
        allowed_pos = [NOUN, ADJ, PUNCT, PROPN]
        allowed_dep = [
            "nsubj",
            "nsubjpass",
            "csubj",
            "csubjpass",
            "agent",
            "expl",
            "dobj",
            "attr",
            "oprd",
            "pobj",
            # "conj",
            "compound",
            "amod",
            "meta",
            "npadvmod",
            "nmod",
            "amod"
        ]  # , add "prep" to extend for "of and "in"
        extended_tokens = [
            i for i in tok.subtree
            if (i.dep_ in allowed_dep and i in tok.children) or (i == tok)
        ]
        # just keep the tokens contiguous with tok
        span_range = [tok.i, tok.i]
        ext_tokens_i = [i.i for i in extended_tokens]
        max_bound = max(ext_tokens_i)
        min_bound = min(ext_tokens_i)
        curr_pos = tok.i
        for cursor in range(tok.i, max_bound + 1):
            if cursor in ext_tokens_i:
                if cursor == curr_pos + 1:
                    span_range[1] = cursor
                    curr_pos = cursor

        curr_pos = tok.i
        for cursor in range(tok.i, min_bound - 1, -1):
            if cursor in ext_tokens_i:
                if cursor == curr_pos - 1:
                    span_range[0] = cursor
                    curr_pos = cursor
        span = Span(self.doc, span_range[0], span_range[1] + 1)
        return span

    def traverse_obj_children(self, tok, verb_path):
        '''
        iterate over all the children and the conjuncts to return objects within the same chain of verbs
        :param tok:
        :param verb_path:
        :return:
        '''
        for i in tok.children:
            # print i, verb_path, get_verb_path_from_ancestors(i), get_verb_path_from_ancestors(i) ==verb_path
            if i.dep_ in OBJECTS and (self.get_verb_path_from_ancestors(i)
                                      == verb_path):
                yield i
            else:
                # recurse and re-yield, otherwise the generator's output is lost
                for j in self.traverse_obj_children(i, verb_path):
                    yield j
        for i in tok.conjuncts:
            # print i, verb_path, get_verb_path_from_ancestors(i), get_verb_path_from_ancestors(i) ==verb_path
            if i.dep_ in OBJECTS and (self.get_verb_path_from_ancestors(i)
                                      == verb_path):
                yield i
            else:
                # recurse and re-yield, otherwise the generator's output is lost
                for j in self.traverse_obj_children(i, verb_path):
                    yield j

    def to_nltk_tree(self, node):
        def tok_format(tok):
            return " ".join(['"%s"' % tok.orth_, tok.tag_, tok.pos_, tok.dep_])

        if node.n_lefts + node.n_rights > 0:
            return nltk.Tree(
                tok_format(node),
                [self.to_nltk_tree(child) for child in node.children])
        else:
            return tok_format(node)

    def print_syntax_tree(self):
        for t in self.sentence:
            if t.dep_ == 'ROOT':
                tree = self.to_nltk_tree(t)
                if isinstance(tree, nltk.Tree):
                    tree.pretty_print(stream=self)

    def get_dependent_obj(self, tok, verb_path):
        '''
        given a token, find related objects for the same chain of verbs (verb_path)
        :param tok:
        :param verb_path:
        :return:
        '''
        all_descendants = []
        if tok.dep_ in OBJECTS and (self.get_verb_path_from_ancestors(tok)
                                    == verb_path):
            all_descendants.append(tok)
        for i in tok.subtree:
            all_descendants.extend(
                list(self.traverse_obj_children(i, verb_path)))
        return list(set(all_descendants))

    def print_syntax_list(self):
        output = ['']
        output.append(' | '.join(('i', 'text', 'pos', 'dep', 'head')))
        for t in self.sentence:
            output.append(' | '.join(
                (str(t.i), '"' + t.text + '"', t.pos_, t.dep_, t.head.text)))
        self.logger.debug('\n'.join(output))

    def collapse_noun_phrases_by_punctation(self):
        '''
        this collapse needs to be used on a single sentence, otherwise it will concatenate
        different sentences
        :return:
        '''
        prev_span = ''
        open_brackets = u'( { [ <'.split()
        closed_brackets = u') } ] >'.split()
        for token in self.sentence:
            try:
                if token.text in open_brackets and token.whitespace_ == u'':
                    next_token = token.nbor(1)
                    if any([i in next_token.text for i in closed_brackets]):
                        span = Span(self.doc, token.i, next_token.i + 1)
                        # prev_span = span.text
                        yield span
                elif any([i in token.text for i in open_brackets]):
                    next_token = token.nbor(1)
                    if next_token.text in closed_brackets:
                        span = Span(self.doc, token.i, next_token.i + 1)
                        # prev_span = span.text
                        yield span

            except IndexError:  # skip end of sentence
                pass

    def collapse_noun_phrases_by_syntax(self):
        not_allowed_conjunction_dep = [prep]
        for token in self.sentence:
            if token.pos in [NOUN, PROPN]:
                extended = self.get_extended_token(token)
                if extended.text != token.text:
                    yield extended
                siblings = list(token.head.children)
                span_range = [token.i, token.i]
                for sibling in siblings:
                    # merge siblings that share the token's dependency, skipping
                    # types (e.g. prep) that should not be collapsed by conjunction
                    if sibling.dep == token.dep and sibling.dep not in not_allowed_conjunction_dep:
                        if sibling.i > token.i:
                            span_range[1] = sibling.i
                        elif sibling.i < token.i:
                            span_range[0] = sibling.i

                if span_range != [token.i, token.i]:
                    span = Span(self.doc, span_range[0], span_range[1] + 1)
                    yield span

    def analyse(self, merge_with_syntax=True, verbose=False):
        '''extract concepts'''
        '''collapse noun phrases based on punctuation and, optionally, on the syntax tree'''
        noun_phrases = list(self.collapse_noun_phrases_by_punctation())
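        # Span.merge() collapses each span into a single token (spaCy 1.x/2.x API;
        # removed in spaCy 3.x in favour of Doc.retokenize())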
        for np in noun_phrases:
            np.merge()
        noun_phrases = list(self.collapse_noun_phrases_by_syntax())
        if merge_with_syntax:
            for np in noun_phrases:
                np.merge()
        if verbose:
            self.print_syntax_list()
            self.print_syntax_tree()

        self.concepts = []
        noun_phrases = []
        verbs = [
            tok for tok in self.sentence
            if tok.pos == VERB and tok.dep not in (aux, auxpass)
        ]
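        # for every main verb, build subject -> verb -> object triples ('concepts')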
        for v in verbs:
            verb_text, verb_path, subjects = self.get_extended_verb(v)
            rights = list([i for i in v.rights if i.pos != VERB])
            # print v, subjects, rights
            for subject in subjects:
                for any_subject in self.get_alternative_subjects(subject):
                    noun_phrases.append(any_subject)
                    for r in rights:
                        dependent_objects = self.get_dependent_obj(
                            r, verb_path)
                        for do in dependent_objects:
                            noun_phrases.append(do)
                            verb_subtree = self.doc[v.left_edge.i:v.right_edge.i + 1].text
                            concept = dict(
                                subject=any_subject.text,
                                subject_range=dict(start=any_subject.idx,
                                                   end=any_subject.idx + len(any_subject.text)),
                                object=do.text,
                                object_range=dict(start=do.idx,
                                                  end=do.idx + len(do.text)),
                                verb=verb_text,
                                verb_path=[i.text for i in verb_path],
                                # subj_ver='%s -> %s' % (any_subject.text, verb_text),
                                # ver_obj='%s -> %s' % (verb_text, do.text),
                                # concept='%s -> %s -> %s' % (any_subject.text, verb_text, do.text),
                                negated=self.isNegated(v) or self.isNegated(any_subject) or
                                        self.isNegated(do),
                                sentence_text=self.doc.text
                            )
                            if verb_subtree != self.doc.text:
                                concept['verb_subtree'] = verb_subtree
                            self.concepts.append(concept)
        self.noun_phrases = list(set(noun_phrases))
        # self.logger.info(self.noun_phrases)
        # for c in self.concepts:
        #     self.logger.info(c['concept'])

        # for tag in self.tags:
        #     print tag, self.doc[tag['start'], tag['end']]

    def __str__(self):
        return self.sentence.text

    def write(self, message):
        '''needed to print nltk graph to logging'''
        if message != '\n':
            self.logger.debug(message)

    def to_pos_tagged_text(self, lower=True):
        text = []
        for token in self.doc:
            if token.text and (token.pos != PUNCT
                               and token.text.lower() not in self.stopwords):
                if lower:
                    text.append(
                        token.text.strip().replace(u' ', u'_').lower() + u'|' +
                        token.pos_)
                else:
                    text.append(token.text.strip().replace(u' ', u'_') + u'|' +
                                token.pos_)
        return u' '.join(text)

    def to_text(self, lower=True):
        text = []
        for token in self.doc:
            if token.text and (token.pos != PUNCT
                               and token.text.lower() not in self.stopwords):
                text_to_append = token.text.strip().replace(u' ', u'_')
                if lower:
                    text_to_append = text_to_append.lower()
                text.append(text_to_append)
        return u' '.join(text)

    def to_ent_and_pos_tagged_text(self,
                                   ents,
                                   ref_ids,
                                   labels,
                                   lower=True,
                                   use_pos=False):
        text = []
        for token in self.doc:
            if token.text and (token.pos != PUNCT
                               and token.text.lower() not in self.stopwords):
                pos = token.pos_
                if token.i in ref_ids:
                    token_text = ref_ids[token.i]
                else:
                    token_text = token.text.strip()
                if lower:
                    token_text = token_text.lower()
                token_text = token_text.replace(u' ', u'_')
                if use_pos:
                    token_text += u'|' + pos
                if token.i in ents:
                    ent = u'_'.join(ents[token.i])
                    token_text += u'|' + ent
                if token.i in labels:
                    # TODO: handle non-ascii labels (UnicodeDecodeError)
                    token_text += u'|' + labels[token.i]
                text.append(token_text)
        return u' '.join(text)
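

# A minimal usage sketch for SentenceAnalysisSpacy, assuming an English spaCy
# model is installed and the project-internal dependencies noted above are
# importable; the model shortcut 'en' is an assumption matching the spaCy
# 1.x/2.x API (Span.merge, etc.) used by this class.
if __name__ == '__main__':
    import spacy

    nlp = spacy.load('en')
    sa = SentenceAnalysisSpacy(u'BRCA1 mutations increase the risk of breast cancer.', nlp)
    sa.analyse()
    for concept in sa.concepts:
        print(concept['subject'] + ' | ' + concept['verb'] + ' | ' + concept['object'])
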
Example #2

import logging
import string
from collections import Counter
from copy import deepcopy

from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from spacy.tokens import Doc

# Project-internal names used below (AbstractNormalizer, AbbreviationsParser,
# MatchedTag, DOMAIN_STOP_WORDS, COMMON_WORDS_CORPUS,
# SHORT_MATCH_CASE_SENSITIVE_CATEGORIES, NOISY_CATEGORIES), and the
# SentenceAnalysisSpacy class from Example #1, are assumed to be importable
# from the surrounding project.


class DocumentAnalysisSpacy(object):
    def __init__(self, nlp, normalize=True, stopwords=None, tagger=None):

        self.logger = logging.getLogger(__name__)
        self._normalizer = AbstractNormalizer()
        self._abbreviations_finder = AbbreviationsParser()

        self.normalize = normalize
        if stopwords is None:
            self.stopwords = set(
                nltk_stopwords.words('english') +
                ["n't", "'s", "'m", "ca", "p", "t"] +
                list(ENGLISH_STOP_WORDS) + DOMAIN_STOP_WORDS +
                list(string.punctuation))
        else:
            self.stopwords = stopwords

        self.nlp = nlp
        self.processed_counter = 0
        self._tagger = tagger

    def process(self, document):
        tags = []
        if isinstance(document, Doc):
            self.doc = document
            abbreviations = self._abbreviations_finder.digest_as_dict(
                self.doc.text)
            if self._tagger is not None:
                tags = self._tagger.tag(self.doc.text)

        elif isinstance(document, unicode):
            if self.normalize:
                document = u'' + self._normalizer.normalize(document)
            abbreviations = self._abbreviations_finder.digest_as_dict(document)
            if self._tagger is not None:
                tags = self._tagger.tag(document)

            # self.logger.debug('abbreviations: ' + str(abbreviations))

            if abbreviations:
                for short_form, long_form in abbreviations.items():
                    if short_form in document and long_form not in document:
                        document = document.replace(short_form, long_form)
            try:
                self.doc = self.nlp(document)
            except Exception:
                self.logger.exception('Error parsing the document: %s' %
                                      document)
                return [None, {}]
        else:
            raise AttributeError('document needs to be unicode or Doc, not %s' %
                                 document.__class__)

        allowed_tag_pos = {u'NOUN', u'PROPN', u'ADJ'}

        concepts = []
        noun_phrases = []
        self.analysed_sentences = []
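        # analyse each sentence separately, collecting its concepts and noun phrases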
        for si, sentence in enumerate(self.doc.sents):
            try:
                analysed_sentence = SentenceAnalysisSpacy(
                    sentence.text, self.nlp, stopwords=self.stopwords)
                analysed_sentence.analyse()
                self.analysed_sentences.append(analysed_sentence)
                for concept in analysed_sentence.concepts:
                    concept['sentence'] = si
                    concepts.append(concept)
                noun_phrases.extend(analysed_sentence.noun_phrases)
            except Exception:
                self.logger.exception('Error parsing the sentence: %s' %
                                      sentence.text)

        # print self.noun_phrases
        noun_phrases = list(
            set([
                i.text for i in noun_phrases
                if i.text.lower() not in self.stopwords
            ]))

        # clustered_np = self.cluster_np(noun_phrases)
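        # count how often each noun phrase occurs in the lower-cased document text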
        noun_phrase_counter = Counter()
        lowered_text = self.doc.text.lower()
        for i in noun_phrases:
            lowered_np = i.lower()
            noun_phrase_counter[lowered_np] = lowered_text.count(lowered_np)
        '''remove plurals with appended s'''
        for np in noun_phrase_counter.keys():
            if np + 's' in noun_phrase_counter:
                noun_phrase_counter[np] += noun_phrase_counter[np + 's']
                del noun_phrase_counter[np + 's']
        '''merge the count of each short form into its long form'''
        for short_form, long_form in abbreviations.items():
            if short_form.lower() in noun_phrase_counter:
                noun_phrase_counter[long_form.lower()] += noun_phrase_counter[
                    short_form.lower()]
                del noun_phrase_counter[short_form.lower()]
        noun_phrases_top = [
            i[0] for i in noun_phrase_counter.most_common(5) if i[1] > 1
        ]
        noun_phrases_recurring = [
            i for i, k in noun_phrase_counter.items() if k > 1
        ]

        # bug https://github.com/explosion/spaCy/issues/589
        # self.processed_counter+=1
        # if self.processed_counter%100 == 0:
        # self.nlp.vocab.strings.flush_oov()
        '''bioentity tags'''
        '''filter tags by POS'''
        self.filtered_tags = []
        for tag in tags:
            tokens = self.get_tokens_in_range(self.doc, tag['start'],
                                              tag['end'])
            token_pos = set([i.pos_ for i in tokens])
            if token_pos & allowed_tag_pos:
                # if tag['match'] in noun_phrases:
                # print '%s >>>>> %s|%s  <<<<< %s'%(self.doc.text[(t['start'] - 10): t['start']], t['match'],
                # t['category'], self.doc.text[t['end']:t['end'] +10])
                self.filtered_tags.append(tag)
                # else:
                #     print tag['label'], tokens, token_pos
        '''filter out defined acronyms that don't agree'''
        #TODO: if the long form is tagged, search for the short form as well
        acronym_filtered_tags = []
        acronyms_to_extend = {}
        lowered_abbreviations = {i.lower(): i for i in abbreviations}
        lowered_long_forms = {i.lower(): i for i in abbreviations.values()}
        inverted_abbreviations = {
            v.lower(): k
            for k, v in abbreviations.items()
        }
        for tag in self.filtered_tags:
            matched_text = tag['match'].lower()
            if matched_text in lowered_abbreviations:
                long_description = abbreviations[
                    lowered_abbreviations[matched_text]]
                if self._tagger.get_tag_by_match(self.filtered_tags,
                                                 long_description.lower()):
                    acronym_filtered_tags.append(tag)
            else:
                acronym_filtered_tags.append(tag)
            if matched_text in lowered_long_forms:
                acronyms_to_extend[inverted_abbreviations[matched_text]] = tag
        if acronyms_to_extend:  # extend matches of long form to the short version if needed
            acronym_tags = self._tagger.extend_tags_to_alternative_forms(
                document, acronyms_to_extend)
            acronym_filtered_tags.extend(acronym_tags)
        acronyms_to_extend_lowered = [i.lower() for i in acronyms_to_extend]
        self.filtered_tags = sorted(acronym_filtered_tags,
                                    key=lambda x: (x['start'], -x['end']))
        '''remove tag matching common words'''
        tags_to_remove = []
        for i, tag in enumerate(self.filtered_tags):
            if tag['category'] in SHORT_MATCH_CASE_SENSITIVE_CATEGORIES:
                '''use case sensitive matching for short strings and for short common words'''
                if (len(tag['match']) < 4) or \
                        (len(tag['match']) < 7 and tag['match'] in COMMON_WORDS_CORPUS["brown_corpus"]):
                    original_case = document[tag['start']:tag['end']]
                    original_case_no_dash = original_case.replace('-', '')
                    original_case_dash_to_space = original_case.replace(
                        '-', ' ')
                    if not ((original_case == tag['label']) or
                            (original_case_no_dash == tag['label']) or
                            (original_case_dash_to_space == tag['label'])):
                        tags_to_remove.append(i)
            elif tag['category'] in NOISY_CATEGORIES:
                '''remove common words from matches'''
                if tag['match'] in COMMON_WORDS_CORPUS["brown_corpus"] and (
                        tag['match'] not in acronyms_to_extend_lowered):
                    tags_to_remove.append(i)

        self.filtered_tags = [
            i for j, i in enumerate(self.filtered_tags)
            if j not in tags_to_remove
        ]
        '''Tag TARGET&DISEASE sentences for open targets'''
        for sentence in self.doc.sents:
            tag_in_sentence = self._tagger.get_tags_in_range(
                self.filtered_tags, sentence.start_char, sentence.end_char)
            tag_types = set([t['category'] for t in tag_in_sentence])
            if ('GENE' in tag_types) and ('DISEASE' in tag_types):
                self.filtered_tags.append(
                    MatchedTag('target-disease', sentence.start_char,
                               sentence.end_char, 'TARGET&DISEASE',
                               'OPENTARGETS', [''], '', '').__dict__)
        '''store tags for subjects and objects in concepts grouped by their category'''
        sentences_start_chars = [sent.start_char for sent in self.doc.sents]
        for concept in concepts:
            sbj_start = sentences_start_chars[
                concept['sentence']] + concept['subject_range']['start']
            sbj_end = sentences_start_chars[
                concept['sentence']] + concept['subject_range']['end']
            sbj_tags = self._tagger.get_tags_in_range(self.filtered_tags,
                                                      sbj_start, sbj_end)
            if sbj_tags:
                concept['subject_tags'] = {}
                sbj_tags_sent_idx = [deepcopy(tag) for tag in sbj_tags]
                for tag in sbj_tags_sent_idx:
                    tag['start'] -= sentences_start_chars[concept['sentence']]
                    tag['end'] -= sentences_start_chars[concept['sentence']]
                    tag_class = tag['category']
                    if tag_class not in concept['subject_tags']:
                        concept['subject_tags'][tag_class] = []
                    concept['subject_tags'][tag_class].append(tag)

            obj_start = sentences_start_chars[
                concept['sentence']] + concept['object_range']['start']
            obj_end = sentences_start_chars[
                concept['sentence']] + concept['object_range']['end']
            obj_tags = self._tagger.get_tags_in_range(self.filtered_tags,
                                                      obj_start, obj_end)
            if obj_tags:
                concept['object_tags'] = {}
                obj_tags_sent_idx = [deepcopy(tag) for tag in obj_tags]
                for tag in obj_tags_sent_idx:
                    tag['start'] -= sentences_start_chars[concept['sentence']]
                    tag['end'] -= sentences_start_chars[concept['sentence']]
                    tag_class = tag['category']
                    if tag_class not in concept['object_tags']:
                        concept['object_tags'][tag_class] = []
                    concept['object_tags'][tag_class].append(tag)

        embedding_text = {
            u'plain': self.to_text(),
            u'pos_tag': self.to_pos_tagged_text(),
            u'ent_tag': self.to_entity_tagged_text()
        }
        return self.doc, \
               dict(chunks=noun_phrases,
                    recurring_chunks=noun_phrases_recurring,
                    top_chunks=noun_phrases_top,
                    abbreviations=[dict(short=k, long=v) for k, v in abbreviations.items()],
                    concepts=concepts,
                    tagged_entities=self.filtered_tags,
                    tagged_entities_grouped=self._tagger.group_matches_by_category_and_reference(self.filtered_tags),
                    tagged_text=self._tagger.mark_tags_in_text(self.doc.text, self.filtered_tags),
                    embedding_text=embedding_text)

    @staticmethod
    def get_tokens_in_range(doc, start, end):
        tokens = []
        for t in doc:
            if start <= t.idx <= end and \
                    start <= (t.idx + len(t.text)) <= end + 1:
                tokens.append(t)
            elif t.idx > end:
                break
        return tokens

    def digest(self, document):
        return self.process(document)[1]

    def __str__(self):
        return 'nlp'

    def cluster_np(self, noun_phrases):
        '''todo: optimise for speed'''

        clusters = {i: [i] for i in noun_phrases}
        for i in noun_phrases:
            for j in noun_phrases:
                # skip keys that were already merged away in a previous iteration
                if i != j and i in clusters and j in clusters and i in j.split(' '):
                    # print '%s -> %s'%(i,j)
                    clusters[j].extend(clusters[i])
                    del clusters[i]
                    # elif i != j and j in i:
                    #     print '%s <- %s'%(j,i)
        # pprint(clusters)
        filtered_noun_phrases = []
        for k, v in clusters.items():
            if len(v) > 1:
                longest = max(v, key=len)
                filtered_noun_phrases.append(longest)
            else:
                filtered_noun_phrases.append(v[0])
        # print filtered_noun_phrases
        return filtered_noun_phrases

    def to_pos_tagged_text(self, lower=True):
        text = []
        for sent in self.analysed_sentences:
            text.append(sent.to_pos_tagged_text(lower=lower))
        return u'\n'.join(text)

    def to_text(self, lower=True):
        text = []
        for sent in self.analysed_sentences:
            text.append(sent.to_text(lower=lower))
        return u'\n'.join(text)

    def to_entity_tagged_text(self,
                              tags2skip=('TARGET&DISEASE',),
                              lower=True,
                              use_pos=False):
        token_tags = {}
        token_refid = {}
        token_labels = {}
        for s_i, s in enumerate(self.doc.sents):
            token_tags[s_i] = {}
            token_refid[s_i] = {}
            token_labels[s_i] = {}
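        # map each filtered tag onto the tokens of the sentence that contains it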
        for tag in self.filtered_tags:
            for s_i, s in enumerate(self.doc.sents):
                if tag['start'] >= s.start_char and \
                        tag['end'] <= s.end_char:
                    tag['sentence'] = s_i
                    analyzed_sentence_doc = self.analysed_sentences[s_i].doc
                    tokens = self.get_tokens_in_range(
                        analyzed_sentence_doc, tag['start'] - s.start_char,
                        tag['end'] - s.start_char)
                    if tag['category'] not in tags2skip:
                        for token in tokens:
                            if token.i not in token_tags[s_i]:
                                token_tags[s_i][token.i] = []
                            token_tags[s_i][token.i].append(tag['category'])
                            if token.i not in token_refid[s_i]:
                                ref = tag['reference']
                                if '/' in ref:  #workaround for uris
                                    ref = ref.split('/')[-1]
                                token_refid[s_i][token.i] = ref
                            if token.i not in token_labels[s_i]:
                                token_labels[s_i][
                                    token.i] = MatchedTag.sanitize_string(
                                        tag['label'])
                    break
        # for s_i, s in enumerate(self.doc.sents):
        #     analyzed_sentence_doc = self.analysed_sentences[s_i].doc
        #     for token in analyzed_sentence_doc:
        #         if token.i not in token_tags[s_i]:
        #             token_tags[s_i][token.i] = [token.pos_]

        text = []
        for s_i, sent in enumerate(self.analysed_sentences):
            sent_text = sent.to_ent_and_pos_tagged_text(
                ents=token_tags[s_i],
                ref_ids=token_refid[s_i],
                labels=token_labels[s_i],
                lower=lower,
                use_pos=use_pos)
            text.append(sent_text.encode('ascii', errors='ignore'))
        return u'\n'.join(text)
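

# A minimal usage sketch for DocumentAnalysisSpacy: process() relies on a
# project-specific tagger object whose class is not shown in these examples,
# so the tagger below is a hypothetical placeholder and the snippet is left
# commented out; the 'en' model shortcut is likewise an assumption.
# import spacy
# nlp = spacy.load('en')
# tagger = ProjectBioEntityTagger()  # hypothetical: must provide tag(), get_tags_in_range(), ...
# analyser = DocumentAnalysisSpacy(nlp, tagger=tagger)
# doc, digest = analyser.process(u'BRCA1 mutations increase the risk of breast cancer.')
# print(digest['chunks'])
# print(digest['concepts'])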