from nlp_architect.pipelines.spacy_np_annotator import NPAnnotator, get_noun_phrases
from nlp_architect.utils.text import SpacyInstance


# Note: create_pipe/add_pipe(component) and the n_threads keyword used in these
# examples follow the spaCy 2.x API that nlp_architect targets.
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    annotator = SpacyInstance(model="en", disable=["textcat", "ner", "parser"]).parser
    annotator.add_pipe(annotator.create_pipe("sentencizer"), first=True)
    annotator.add_pipe(NPAnnotator.load(model_path, settings_path), last=True)
    doc = annotator(text)
    noun_phrases = [p.text for p in get_noun_phrases(doc)]
    for p in phrases:
        assert p in noun_phrases
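
In the original suite this test is presumably driven by pytest parametrization;
a minimal sketch of such a setup, where the model paths, the sentence and the
expected phrases are made-up placeholders rather than real fixtures:

import pytest

@pytest.mark.parametrize('model_path, settings_path, text, phrases', [
    ('chunker_model.h5',        # placeholder: path to a trained chunker model
     'chunker_settings.dat',    # placeholder: path to the matching settings
     'The quick brown fox jumped over the lazy dog.',
     ['The quick brown fox', 'the lazy dog']),
])
def test_np_annotator_linked(model_path, settings_path, text, phrases):
    ...  # body as above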
Example #2
import logging

from nlp_architect.pipelines.spacy_np_annotator import get_noun_phrases

logger = logging.getLogger(__name__)


def extract_noun_phrases(docs, nlp_parser, chunker):
    logger.info('extract nps from: %s', docs)
    spans = []
    for doc in nlp_parser.pipe(docs, n_threads=-1):
        if 'nlp_arch' in chunker:
            spans.extend(get_noun_phrases(doc))
        else:
            nps = list(doc.noun_chunks)
            spans.extend(nps)
    logger.info('nps= %s', str(spans))
    return spans
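
A hedged usage sketch: any chunker string that does not contain 'nlp_arch'
falls through to spaCy's built-in doc.noun_chunks (the model name and the
sentences are illustrative):

from nlp_architect.utils.text import SpacyInstance

nlp_parser = SpacyInstance(model='en_core_web_sm',
                           disable=['textcat', 'ner']).parser
docs = ['Deep learning models need large corpora.',
        'Noun phrase extraction feeds set expansion.']
spans = extract_noun_phrases(docs, nlp_parser, chunker='spacy')
print([s.text for s in spans])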
Example #3
from tqdm import tqdm

from nlp_architect.pipelines.spacy_np_annotator import get_noun_phrases


# get_group_norm is defined alongside this function in its source module.
def mark_noun_phrases(corpus_file,
                      marked_corpus_file,
                      nlp_parser,
                      lines_count,
                      chunker,
                      mark_char='_',
                      grouping=False):
    """Rewrite corpus_file into marked_corpus_file, joining the tokens of each
    noun phrase with mark_char so multi-word NPs survive as single tokens."""
    i = 0
    with tqdm(total=lines_count) as pbar:
        for doc in nlp_parser.pipe(corpus_file, n_threads=-1):
            if 'nlp_arch' in chunker:
                spans = get_noun_phrases(doc)
            else:
                spans = list(doc.noun_chunks)
            i += 1
            if len(spans) > 0:
                span = spans.pop(0)
            else:
                span = None
            span_written = False
            for token in doc:
                if span is None:
                    if len(token.text.strip()) > 0:
                        marked_corpus_file.write(token.text + ' ')
                else:
                    # token lies outside the current noun-phrase span
                    if token.idx < span.start_char or token.idx >= span.end_char:
                        if len(token.text.strip()) > 0:
                            marked_corpus_file.write(token.text + ' ')
                    else:
                        if not span_written:
                            # mark only multi-character, non-pronoun spans
                            if len(span.text) > 1 and span.lemma_ != '-PRON-':
                                if grouping:
                                    text = get_group_norm(span)
                                else:
                                    text = span.text
                                # join NP tokens with mark_char, appending a trailing mark_char
                                text = text.replace(' ', mark_char) + mark_char
                                marked_corpus_file.write(text + ' ')
                            else:
                                marked_corpus_file.write(span.text + ' ')
                            span_written = True
                        if token.idx + len(token.text) == span.end_char:
                            if len(spans) > 0:
                                span = spans.pop(0)
                            else:
                                span = None
                            span_written = False
            marked_corpus_file.write('\n')
            pbar.update(1)
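
A sketch of a typical call, assuming a plain-text corpus with one document per
line (the file names are assumptions). Joining NP tokens with mark_char is
presumably what lets downstream word-embedding training treat each noun phrase
as a single vocabulary item:

from nlp_architect.utils.text import SpacyInstance

nlp_parser = SpacyInstance(model='en_core_web_sm',
                           disable=['textcat', 'ner']).parser
with open('corpus.txt', encoding='utf-8') as corpus_file, \
        open('corpus_marked.txt', 'w', encoding='utf-8') as marked_file:
    lines_count = sum(1 for _ in corpus_file)
    corpus_file.seek(0)
    mark_noun_phrases(corpus_file, marked_file, nlp_parser, lines_count,
                      chunker='spacy')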
Example #4
    def score_documents(self,
                        texts: list,
                        limit=-1,
                        return_all=False,
                        min_tf=5):
        """Parse texts, extract noun-phrase spans, and score them with TF-IDF,
        C-value and inverted frequency; return phrases with a merged score, or
        all three per-metric scores when return_all is True."""
        documents = []
        assert len(texts) > 0, "texts should contain at least 1 document"
        assert min_tf > 0, "min_tf should be at least 1"
        with tqdm(total=len(texts),
                  desc="documents scoring progress",
                  unit="docs") as pbar:
            for doc in self.nlp.pipe(texts, n_threads=-1):
                if len(doc) > 0:
                    documents.append(doc)
                pbar.update(1)

        corpus = []
        for doc in documents:
            spans = get_noun_phrases(doc)
            if len(spans) > 0:
                corpus.append((doc, spans))

        if len(corpus) < 1:
            return []

        documents, doc_phrases = list(zip(*corpus))
        scorer = TextSpanScoring(documents=documents,
                                 spans=doc_phrases,
                                 min_tf=min_tf)
        tfidf_scored_list = scorer.get_tfidf_scores()
        if len(tfidf_scored_list) < 1:
            return []
        cvalue_scored_list = scorer.get_cvalue_scores()
        freq_scored_list = scorer.get_freq_scores()

        if limit > 0:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            tfidf_scored_list_limit = []
            cvalue_scored_list_limit = []
            freq_scored_list_limit = []
            for phrase in list(zip(*tfidf_scored_list))[0][:limit]:
                tfidf_scored_list_limit.append((phrase, tf[tuple(phrase)]))
                cvalue_scored_list_limit.append((phrase, cv[tuple(phrase)]))
                freq_scored_list_limit.append((phrase, fr[tuple(phrase)]))
            tfidf_scored_list = tfidf_scored_list_limit
            cvalue_scored_list = cvalue_scored_list_limit
            freq_scored_list = freq_scored_list_limit

        tfidf_scored_list = scorer.normalize_l2(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_l2(cvalue_scored_list)
        freq_scored_list = scorer.normalize_minmax(freq_scored_list,
                                                   invert=True)
        tfidf_scored_list = scorer.normalize_minmax(tfidf_scored_list)
        cvalue_scored_list = scorer.normalize_minmax(cvalue_scored_list)
        if return_all:
            tf = {tuple(k[0]): k[1] for k in tfidf_scored_list}
            cv = {tuple(k[0]): k[1] for k in cvalue_scored_list}
            fr = {tuple(k[0]): k[1] for k in freq_scored_list}
            final_list = []
            for phrases in tf.keys():
                final_list.append(([p for p in phrases], tf[phrases],
                                   cv[phrases], fr[phrases]))
            return final_list
        merged_list = scorer.interpolate_scores(
            [tfidf_scored_list, cvalue_scored_list], [0.5, 0.5])
        merged_list = scorer.multiply_scores([merged_list, freq_scored_list])
        merged_list = scorer.normalize_minmax(merged_list)
        final_list = []
        for phrases, score in merged_list:
            if any([len(p) > 1 for p in phrases]):
                final_list.append(([p for p in phrases], score))
        return final_list
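
The merged score averages the min-max-normalized TF-IDF and C-value scores
(0.5 weight each) and multiplies by an inverted, min-max-normalized frequency
score, so phrases that are merely frequent are penalized. A hypothetical call,
where extractor is an instance of the class this method belongs to:

texts = open('corpus.txt', encoding='utf-8').read().splitlines()
scored = extractor.score_documents(texts, limit=100, min_tf=5)
for phrase_group, score in scored[:10]:
    print(round(score, 3), phrase_group)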
Example #5
        if 'nlp_arch' in args.chunker:
            # assumed reconstruction of the truncated branch head: build the
            # spaCy pipeline and attach the NP annotator, as in Example #1
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner', 'parser']).parser
            nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
            nlp.add_pipe(NPAnnotator.load(_path_to_model, _path_to_params),
                         last=True)
        else:
            nlp = SpacyInstance(model='en_core_web_sm',
                                disable=['textcat', 'ner']).parser
        logger.info('spacy loaded')

        num_lines = sum(1 for line in corpus_file)
        corpus_file.seek(0)
        logger.info('%i lines in corpus', num_lines)
        i = 0

        with tqdm(total=num_lines) as pbar:
            for doc in nlp.pipe(corpus_file, n_threads=-1):
                if 'nlp_arch' in args.chunker:
                    spans = get_noun_phrases(doc)
                else:
                    spans = list(doc.noun_chunks)
                i += 1
                if len(spans) > 0:
                    span = spans.pop(0)
                else:
                    span = None
                span_written = False
                for token in doc:
                    if span is None:
                        if len(token.text.strip()) > 0:
                            marked_corpus_file.write(token.text + ' ')
                    else:
                        # token lies outside the current span
                        if token.idx < span.start_char or token.idx >= span.end_char:
                            if len(token.text.strip()) > 0:
                                marked_corpus_file.write(token.text + ' ')