Example no. 1
def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == punct_chars
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == punct_chars
Example no. 2
def test_sentencizer_serialize_bytes(en_vocab):
    punct_chars = [".", "~", "+"]
    sentencizer = Sentencizer(punct_chars=punct_chars)
    assert sentencizer.punct_chars == set(punct_chars)
    bytes_data = sentencizer.to_bytes()
    new_sentencizer = Sentencizer().from_bytes(bytes_data)
    assert new_sentencizer.punct_chars == set(punct_chars)
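Examples 1 and 2 are two revisions of the same spaCy test: newer releases normalize punct_chars to a set, which is why the second variant compares against set(punct_chars). A minimal round-trip sketch, assuming such a release:

from spacy.pipeline import Sentencizer

# serialize a sentencizer with custom punctuation and restore it into a fresh instance
sentencizer = Sentencizer(punct_chars=[".", "~", "+"])
restored = Sentencizer().from_bytes(sentencizer.to_bytes())
assert restored.punct_chars == {".", "~", "+"}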
Example no. 3
def process_text(text: str, debug: bool = False) -> Dictionary:
    """
    Process given text through NLP pipes.
    """

    nlp = spacy.load("en_core_web_sm")
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer, before="parser")
    doc = nlp(text)
    dictionary_path = f"{NLP_SERVICE_ROOT}/assets/dictionary.pickle"

    if os.path.isfile(dictionary_path) and not debug:
        with open(dictionary_path, "rb") as f:
            dictionary = pickle.load(f)
    else:
        dictionary = Dictionary(debug=False)
        with open(dictionary_path, "wb") as w:
            pickle.dump(dictionary, w)

    for sentence in doc.sents:
        for token in sentence:
            relic = Relic(token, sentence)
            dictionary.add(relic)

    return dictionary
Example no. 4
def make_nlp():
    '''
    Generates a spaCy nlp object and adds the pipeline components.
    Returns:
        an nlp object
    '''
    nlp = spacy.load("en_core_web_sm")
    sentencizer = Sentencizer(punct_chars=['.'])
    ruler = make_entity_ruler(nlp)
    Token.set_extension('is_solitarious', default=None, force=True)
    Span.set_extension('subject_decline', default=False, force=True)
    Span.set_extension('contains_adults', default=None, force=True)
    Span.set_extension('ent_solitarious', default=None, force=True)
    merge_ents = nlp.create_pipe("merge_entities")
    combine_ents_ruler = combine_entities_ruler(nlp)
    nlp.add_pipe(sentencizer, first=True)
    nlp.add_pipe(ruler, before='ner')
    nlp.add_pipe(refine_entities)
    nlp.add_pipe(subject_decline)
    nlp.add_pipe(merge_ents)
    nlp.add_pipe(combine_ents_ruler)
    nlp.add_pipe(is_solitarious)
    nlp.add_pipe(contains_adults)
    nlp.add_pipe(ent_solitarious)

    return nlp
Example no. 5
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=punct_chars)
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents
Example no. 6
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    assert [t.is_sent_start for t in doc] == sent_starts
    assert len(list(doc.sents)) == n_sents
Example no. 7
def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents
Example no. 8
def test_flatten_docs_to_sens(vocab):
    sentencizer = Sentencizer(".")
    nlp = spacy.blank("en")
    nlp.add_pipe(sentencizer)
    texts = ["Foo is bar. Bar is baz.", "It is a sentence."]
    docs = nlp.pipe(texts)
    all_sents = flatten_docs_to_sents(docs)
    assert len(all_sents) == 3
Example no. 9
def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer()
    doc = sentencizer(doc)
    assert doc.is_sentenced
    sent_starts = [t.is_sent_start for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert len(list(doc.sents)) == 2
Example no. 10
def break_into_sentences(txt):
    sentencizer = Sentencizer()
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(txt)
    span_list = list(doc.sents)
    sentence_list = [t.text for t in span_list]
    return sentence_list
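Many of the snippets here use the spaCy v2 convention of building a component object (or calling create_pipe) and passing it to add_pipe. In spaCy v3 the same pipeline is assembled by passing the registered factory name instead; a minimal sketch under that assumption:

import spacy

# v3 style: add the sentencizer by its factory name rather than as an object
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
doc = nlp("Foo is bar. Bar is baz.")
assert [sent.text for sent in doc.sents] == ["Foo is bar.", "Bar is baz."]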
Example no. 11
 def __init__(self, nlp):
     """"""
     self.nlp = nlp
     self.sentencizer = Sentencizer()
     # https://github.com/explosion/spaCy/issues/3569
     try:
         self.nlp.add_pipe(self.sentencizer, first=True)
     except:
         # already added
         pass
Example no. 12
def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    sent_starts = [t.is_sent_start for t in doc]
    sent_ends = [t.is_sent_end for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert sent_ends == [False, True, False, False, False, False, True]
    assert len(list(doc.sents)) == 2
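Example 12 is the spaCy v3 revision of Example 9: Doc.is_sentenced was replaced by Doc.has_annotation("SENT_START"), and Token.is_sent_end is checked alongside Token.is_sent_start. A self-contained sketch of the same checks, assuming a v3 install:

from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
from spacy.vocab import Vocab

# build a bare Doc, run the sentencizer on it directly, and inspect the boundaries
doc = Doc(Vocab(), words=["Hello", "!", "This", "is", "a", "test", "."])
doc = Sentencizer()(doc)
assert doc.has_annotation("SENT_START")
assert [t.is_sent_end for t in doc] == [False, True, False, False, False, False, True]
assert len(list(doc.sents)) == 2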
Example no. 13
def remove_duplicates(txt):

    #first check via "\n" to remove duplicates
    new_txt = unique(txt.split("\n"))

    sentencizer = Sentencizer()
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(new_txt)
    span_list = list(doc.sents)
    sentence_list = [t.text for t in span_list]
    summary = unique(sentence_list)
    return summary
Example no. 14
    def __init__(self,
                 archive_path: str,
                 predictor_name: str,
                 text_path: str,
                 cuda_device: int,
                 language: str = "en_core_web_sm"):
        archive = load_archive(archive_path, cuda_device=cuda_device)
        self.predictor = TextPredictor.from_archive(
            archive, predictor_name=predictor_name)

        self._nlp = spacy.load(language)
        sentencizer = Sentencizer()
        self._nlp.add_pipe(sentencizer)

        self.text = self.read_lines(text_path)
Example no. 15
def tokenize(full_input_path, corpus):

    with open(full_input_path, "r") as f:
        text = f.read()
        text = re.sub("\n", "", text)
        text = re.sub("\ufeff", "", text)

    if corpus == "OCD":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        nlseg = NewLineSegmenter()
        nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter')
    elif corpus == "OE":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"]) 
        sentencizer = Sentencizer(punct_chars=[".", "\n"])  
        nlp.add_pipe(sentencizer) 
    doc = nlp(text)

    tokenized_sents = []
    for sent in doc.sents:
        tokens = []
        for token in sent:
            if token.text == "(":
                token_text = "-LRB-"
            elif token.text == ")":
                token_text = "-RRB-"
            elif token.text in [":)", ":-)", ":(", ":-("]:
                token_text = "-EMOJI-"
            else:
                token_text = token.text
            tokens.append(token_text)
        tokenized_sent = " ".join(tokens)
        tokenized_sents.append(tokenized_sent)

    if corpus == "OCD":    
        tokenized_text = "".join(tokenized_sents)
    elif corpus == "OE":
        non_empty_sents = []
        for sent in tokenized_sents:
            non_empty_sents.append(sent.lstrip())
        tokenized_text = "\n".join(non_empty_sents)
    return tokenized_text
Example no. 16
def generate_knowledge_graph(text):
    doc_title = str(time.time())
    sentencizer = Sentencizer()
    doc = nlp(text)
    clean_data = []
    n = 0
    for sents in doc.sents:
        if len(str(sents).replace("\n", "")) > 0:
            clean = str(sents).replace("\n", "")
            if clean.strip() != "" and validateString(clean):
                clean_data.append(clean)
                n = n + 1
    print(n)
    entity_pairs = []
    for data in tqdm(clean_data):
        entity_pairs.append(get_entities(data))
    print("\nEntity Extraction completed")

    relations = [get_relation(i) for i in clean_data]
    source = []
    target = []
    edge = []
    indexes = []

    for i in tqdm(range(len(entity_pairs))):
        if validateAlpha(entity_pairs[i][0]) and validateAlpha(
                entity_pairs[i][1]) and validateString(relations[i]):
            ent1 = removeStop(entity_pairs[i][0])
            ent2 = removeStop(entity_pairs[i][1])
            rel = relations[i]
            if validateAlpha(ent1.lower()) and validateAlpha(ent2.lower()):
                source.append(ent1.lower().strip())
                target.append(ent2.lower().strip())
                edge.append(rel)
                indexes.append(i)
    print("\nTotal number of extracted pairs:", len(edge))
    print("\nEdges: ", edge)
    print("\nEntities: ", entity_pairs)
    if (len(edge) == 0 or len(entity_pairs) == 0):
        return False
    else:
        G = nx.DiGraph(directed=True)

        for i in tqdm(range(len(edge))):
            G.add_weighted_edges_from([(source[i], target[i], i)])

        print("\nGraph generated")
        size = 20
        if len(edge) / 2 > 20:
            size = len(edge) / 2
        plt.figure(figsize=(size, size))
        edge_labels = dict([((
            u,
            v,
        ), edge[d['weight']]) for u, v, d in G.edges(data=True)])

        pos = nx.spring_layout(G, k=0.8)
        nx.draw(G,
                with_labels=True,
                node_color='skyblue',
                node_size=5000,
                edge_color='r',
                edge_cmap=plt.cm.Blues,
                pos=pos,
                font_size=20)
        nx.draw_networkx_edge_labels(G,
                                     pos,
                                     edge_labels=edge_labels,
                                     font_size=15)

        plt.title("KNOWLEDGE GRAPH FOR DOCUMENT: " + doc_title,
                  fontdict={'fontsize': 50})
        plt.savefig(os.path.join(IMAGE_DIR, doc_title + ".png"))

        return os.path.join(IMAGE_DIR, doc_title + ".png")
Example no. 17
def test_sentencizer_across_scripts(lang, text):
    nlp = spacy.blank(lang)
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer)
    doc = nlp(text)
    assert len(list(doc.sents)) > 1
Example no. 18
def prepare_model(model="en_core_web_md"):
    nlp = spacy.load(model)
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer, before="parser")
    return nlp
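A hedged v3 rewrite of this helper, assuming the same model package is installed; the pipe is added by its factory name and placed before the parser exactly as above:

import spacy

def prepare_model(model="en_core_web_md"):
    # insert a rule-based sentencizer ahead of the dependency parser
    nlp = spacy.load(model)
    nlp.add_pipe("sentencizer", before="parser")
    return nlp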
Example no. 19
import spacy
import string
from spacy.pipeline import Sentencizer

nlp = spacy.load("de_core_news_md")
sentencizer = Sentencizer(punct_chars=[char for char in string.punctuation])
nlp.add_pipe(sentencizer, name="sentence_segmenter", before="parser")


def get_oie(corpus):
    # decision logic for extracting roots and terms; for easier analysis, the sentences are collected as well

    roots = []
    terms = []
    sents = []

    doc = nlp(corpus.lower())

    for sent in doc.sents:
        t = set()
        # get sentences
        sents.append(sent.text)

        # get important tokens from sentence
        pd, oc, ng = "", "", ""
        for token in sent:
            if token.dep_ == "pd":
                pd = token.lemma_
            if token.dep_ == "oc":
                oc = token.lemma_
            if token.dep_ == "ng" and token.head.dep_ == "ROOT":
Example no. 20
def sentencizer():
    return Sentencizer()
Example no. 21
"""
Tests for backend/nlp/src/services/dictionary.py
"""
import spacy
from spacy.pipeline import Sentencizer

from services import dictionary
from shared.tests.base import TestsBaseClass


nlp = spacy.load("en_core_web_sm")
sentencizer = Sentencizer()
nlp.add_pipe(sentencizer, before="parser")
sample_text = "You are not prepared!"
doc = nlp(sample_text)
for sentence in doc.sents:
    for token in sentence:
        relic = dictionary.Relic(token, sentence)
        break
    break


class DictionaryTests(TestsBaseClass):
    """
    Tests for backend/nlp/src/services/dictionary.py
    """

    def test_01_test_relic_class_init(self):
        """
        Should contain required properties.
        """
Example no. 22
def extract_segments(file_text, file_parse, corpus):
    """
    :param doc_text: absolute path to the text file, where each line is a sentence
    :param doc_parse: absolute path to the parse file, where each line is a parsed sentence as a phrase-structure three
    :return segments_dict: dict, where each key is a segment id, and each value is a dictionary with the text, start index and end index of the segment
    """

    with open(file_text, "r", encoding="utf-8") as f:
        doc_text = f.read()

    if corpus == "OCD":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        nlseg = NewLineSegmenter()
        nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter')
    elif corpus == "OE":
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "parser"])
        sentencizer = Sentencizer(punct_chars=["."])
        nlp.add_pipe(sentencizer)
    doc = nlp(doc_text)

    with open(file_parse, "r") as f:
        sent_parses = f.readlines()

    segments_list = []

    sent_index = 0
    for sent, parse in zip(doc.sents, sent_parses):
        tokens = {}
        for index, token in enumerate(sent):
            tokens[index] = {}
            start = token.idx
            end = token.idx + len(token.text)
            tokens[index]["start"] = start
            tokens[index]["end"] = end
            tokens[index]["text"] = doc_text[start:end]

        # get segments from the parse tree
        t = Tree.fromstring(parse)

        for index, treepos in enumerate(t.treepositions("leaves")):
            t[treepos] = index

        segments_ids = []
        for st in t.subtrees():
            # save segment if it is not already saved and if it does not contain only terminals (height=2)
            if corpus == "OCD":
                # exclude punctuation leaves
                st_leaves = [
                    leaf[0] for leaf in st.pos() if leaf[1] not in [
                        "#", "$", '"', "``", "(", ")", "-LRB-", "-RRB-", ",",
                        ":", "."
                    ]
                ]
            elif corpus == "OE":
                st_leaves = st.leaves()
            if st_leaves not in segments_ids and st.height() > 2 and len(
                    st_leaves) > 0:
                segments_ids.append(st_leaves)
        for index, segment in enumerate(segments_ids):
            segment_start = tokens[segment[0]][
                "start"] + sent_index  # shift the index to fit the bug in the eval script
            segment_end = tokens[segment[-1]][
                "end"] + sent_index  # shift the index to fit the bug in the eval script
            segment_text = doc_text[segment_start:segment_end]
            segments_list.append({
                "start": segment_start,
                "end": segment_end,
                "text": segment_text
            })
        sent_index += 1

    segments_dict = {}
    for index, data in enumerate(segments_list):
        segments_dict[index] = data

    return segments_dict
Example no. 23
import numpy as np
import librosa
import argparse
import torch

import soundfile as sf
import pyrubberband as pyrb

import re
import sys
import spacy
from spacy.pipeline import Sentencizer


# Prepare NLP pipeline
nlp = spacy.load("en_core_web_sm", disable=["tagger","parser", "ner"])
sentencizer = Sentencizer(punct_chars=[".", "?", "!", ":", "..."])
nlp.add_pipe(sentencizer)

"""#### Prepare the models"""

# Print some environment information (for debugging purposes)

print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
    print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
          "for deep learning, ensure that the drivers are properly installed, and that your "
          "CUDA version matches your PyTorch installation. CPU-only inference is currently "
          "not supported.", file=sys.stderr)
    quit(-1)
device_id = torch.cuda.current_device()
Example no. 24
 def build_sentencizer(self, verbose=False):
     self.nlp_sentencizer = English()
     sentencizer = Sentencizer()
     self.nlp_sentencizer.add_pipe(component=sentencizer)
     if verbose:
         print("pipe names: {}".format(self.nlp_sentencizer.pipe_names))
Example no. 25
def custom_sentencizer(texts):
    # disabling Named Entity Recognition for speed
    nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
    boundary_1 = re.compile(r'\b(fig)\b|\b(ex)\b|\b[a-z]\b|\b(prof)\b|'
                            r'\b(eg)\b|\b(etc)\b|\b(sp)\b|\b(an)\b|'
                            r'\b(pp)\b|\b(vol)\b|\b(col)\b')
    boundary_2_2 = re.compile(r'\b(et)\b')
    boundary_2_1 = re.compile(r'\b(al)\b')
    boundary_p = re.compile(r'[.?!:;’“”\"\'0-9]|(wAnM.)')
    boundary_a = re.compile(r'\b(th)\b|\b(st)\b|\b(nd)\b|\b(rd)\b|[-]')
    boundary_d = re.compile(r'[0-9]')

    # Utility functions
    def custom_seg_1(doc):
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if (token.text == '.' and boundary_1.match(prev.lower())
                    and index != (length - 1)):
                doc[index + 1].sent_start = False
            prev = token.text
        return doc

    def custom_seg_2(doc):
        length = len(doc)
        # If single token, return
        if length < 2:
            return doc
        # If multiple token, apply rule
        else:
            prev_2 = doc[0].text
            prev_1 = doc[1].text
            for index, token in enumerate(doc):
                if index > 0:
                    if (((token.text == '.') | (token.text == '.,') |
                         (token.text == '.('))
                            and boundary_2_2.match(prev_2.lower())
                            and boundary_2_1.match(prev_1.lower()) and index !=
                        (length - 1)):

                        if ((doc[index + 1].text == ',')
                                or (doc[index + 1].text == '(')):
                            doc[index + 2].sent_start = False
                        else:
                            doc[index + 1].sent_start = False

                    prev_1 = token.text
                    prev_2 = doc[index - 1].text
            return doc

    def custom_seg_3(doc):
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if ((token.text == '\n' or token.text == '\n ')
                    and not boundary_p.match(prev.lower()) and index !=
                (length - 1)):
                doc[index + 1].sent_start = False
            prev = token.text
        return doc

    def custom_seg_4(doc):
        succ = doc[1].text
        length = len(doc)
        for index, token in enumerate(doc):
            if index < (length - 2):
                if ((token.text == '\n' or token.text == '\n ')
                        and boundary_a.match(succ.lower())):
                    doc[index + 1].sent_start = False
                succ = doc[index + 2].text
        return doc

    def custom_seg_5(doc):
        prev = doc[0].text
        succ = doc[1].text
        length = len(doc)
        for index, token in enumerate(doc):
            if index < (length - 2):
                if ((token.text == '\n' or token.text == '\n ')
                        and boundary_d.match(prev.lower())):
                    if not succ.isupper():  # logical not; bitwise ~ on a bool is always truthy
                        doc[index + 1].sent_start = False
                prev = token.text
                succ = doc[index + 2].text
        return doc

    def brief_cleaning_fun(text):
        text = BeautifulSoup(html.unescape(text),
                             'lxml').text  # remove HTML tags
        text = re.sub(r'https?://\S+', '', str(text))  # remove URLs
        # Remove question marks inside parenthesis (they mess up sentence splitting)
        text = re.sub(r'\(\?', r'\(', str(text))
        # Remove question marks inside parenthesis (they mess up sentence splitting)
        text = re.sub(r'\?\)', r'\)', str(text))
        text = re.sub(r'(\? \()', r' \(', str(text))
        # Replace \xa0\n tags with space
        text = re.sub(r'(\xa0\n)', ' ', str(text))
        text = re.sub(r'(\xa0 \n)', ' ', str(text))
        # Replace -e.g. with e.g.
        text = re.sub(r'(-e.g.)', 'e.g.', str(text))
        # Replace \xa0 tags with space
        text = re.sub(r'(\xa0)', ' ', str(text))
        # Remove newline characters between brackets.
        text = re.sub(r'(?s)(?<=[\(]).*?(?=[\)])',
                      lambda x: x.group().replace('\n', ' '), str(text))
        # Replace space between linebreak characters
        text = re.sub(r'(?<=(\n)) *(?=(\n))', '', str(text))
        # Replace multiple occurrences of whitespace characters with single one
        text = re.sub(r'(\s)\1{1,}', r'\1', str(text))
        # Replace square brackets
        text = re.sub(r'[\[\]]', '', str(text))
        # Replace occurrences of newline character before a comma
        text = re.sub(r'(\n,)|(\n ,)', r',\n', str(text))
        # keep only certain characters
        text = re.sub(r"[^a-zA-Z0-9,'‘’“”\":;.?!\(\)\-(\n)]", ' ', str(text))
        return text

    def sentence_cleaning_fun(text):
        if not text:
            text = 'nan'
        return text

    def sentencization(doc):
        # Accept only documents with more than two words (one word and one punctuation.)
        if len(doc) > 2:
            text = [
                sentence_cleaning_fun(token.text).strip().split()
                for token in doc.sents
            ]
            # Remove short sentences
            text = [' '.join(x) for x in text if len(x) > 1]
            # Remove bracket characters if they are at the end of a sentence.
            # Remove space between digit and "st","nd","rd" and "th"
            for i, x in enumerate(text):
                if x[-1] in ['(', ')']:
                    text[i] = x[:-1]
                text[i] = re.sub(
                    r"(?i)\b[0-9][0-9]*\b (\bst\b|\bnd\b|\brd\b|\bth\b)",
                    lambda x: x.group().replace(' ', ''), str(text[i]))

            if text:
                return text
            else:
                return 'nan'
        else:
            return 'nan'

    sentencizer = Sentencizer(punct_chars=[
        ".", "?", "!", "\n", "\n ", "\n\n", "\n \n ", "\n\n\n", "\n \n \n ",
        "\n\n\n\n", "\n \n \n \n ", "\n\n\n\n\n", "\n \n \n \n \n "
    ])
    nlp.add_pipe(sentencizer)
    nlp.add_pipe(custom_seg_1, after='sentencizer')
    nlp.add_pipe(custom_seg_2, after='custom_seg_1')
    nlp.add_pipe(custom_seg_3, after='custom_seg_2')
    nlp.add_pipe(custom_seg_4, after='custom_seg_3')
    nlp.add_pipe(custom_seg_5, after='custom_seg_4')

    brief_cleaning = (brief_cleaning_fun(row) for row in texts)
    texts_processed = [
        sentencization(doc)
        for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=25),
                        total=len(texts))
    ]

    return texts_processed
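Example 25 chains plain functions after the sentencizer via add_pipe, which spaCy v2 allowed. In v3 a custom boundary-setting function must first be registered as a component; a minimal no-op sketch under that assumption (custom_seg_demo is a hypothetical name used only for illustration):

import spacy
from spacy.language import Language

@Language.component("custom_seg_demo")
def custom_seg_demo(doc):
    # custom sentence-boundary rules would go here; the doc is returned unchanged
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.add_pipe("custom_seg_demo", after="sentencizer")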
Example no. 26
import spacy
import string
import re
from collections import Counter
from spacy.pipeline import Sentencizer
from spacy.matcher import PhraseMatcher

nlp = spacy.load("de_core_news_md")
sentencizer = Sentencizer(punct_chars=[".", "?", "!", ",", ";", ":"])
nlp.add_pipe(sentencizer, name="sentence_segmenter", before="parser")


def get_abbr():
    # load a list of German abbreviations for normalization
    # taken from https://de.wiktionary.org/wiki/Kategorie:Abk%C3%BCrzung_(Deutsch)

    with open("resources/abbreviations_ger.txt", "r", encoding="utf-8") as f:
        x = f.readlines()
    abbreviations = [item.rstrip('\n') for item in x]

    return abbreviations


def sentences(corpus, no_questions):
    # potentially add root form support
    terms = [
        "wohin", "wie", "woher", "was", "wieso", "warum", "wer", "welche",
        "wen", "wem", "wo", "?"
    ]
    matcher = PhraseMatcher(nlp.vocab)
Example no. 27
def craft_input_to_bolstm():
    """Convert the documents in the CRAFT corpus to the input structure of BO-LSTM."""

    # Sentence segmentation using Spacy
    nlp = English()
    sentencizer = Sentencizer()
    nlp.add_pipe(sentencizer)

    # Parse each document in corpus directory -
    corpus_dir = "chebi_craft_corpus/"
    docs_list = os.listdir(corpus_dir)

    for idoc, file in enumerate(docs_list):

        if file[-3:] == "xmi":
            file_path = corpus_dir + file
            file_id = str(file[:-4])

            #Retrieve the entire document text
            tree = ET.parse(file_path)
            root = tree.getroot()

            for child in root:
                if child.tag == "{http:///uima/cas.ecore}Sofa":
                    document_text = child.attrib["sofaString"]

            # Import annotations from annotations file into annotation_list
            annotation_list = []
            annotation_file = open(file_path[:-3] + "ann", "r")

            for line in annotation_file.readlines():
                entity_text = line.split("\t")[2].strip("\n")
                ontology_id = line.split("\t")[1].split(" ")[0].replace(
                    "_", ":")
                offset_begin = int(line.split("\t")[1].split(" ")[1])
                offset_end = int(
                    line.split("\t")[1].split(" ")[2].split(";")[0])
                annotation_list.append(
                    (entity_text, ontology_id, offset_begin, offset_end))

            annotation_file.close()

            # Create the xml tree for output file
            new_root = ET.Element("document")
            new_root.set("id", file_id)

            # Iterate over each sentence in document
            docSpacy = nlp(document_text)
            sentence_count, token_count = 0, 0

            for sentence in docSpacy.sents:
                sentence_count += 1
                begin_offset = token_count + 1
                token_count += len(sentence.text) + 1
                final_offset = token_count
                sentence_id = str(file_id) + ".s" + str(sentence_count)
                entity_count = 0
                entity_check = []

                # Create xml structure for sentence
                new_sentence = ET.SubElement(new_root, "sentence")
                new_sentence.set("id", sentence_id)
                new_sentence.set("text", sentence.text)

                # Check if there is any annotation present in the current sentence
                valid_entities_list = []

                for annotation in annotation_list:

                    if annotation[2] >= begin_offset and annotation[
                            2] <= final_offset:
                        # There is an annotation in this sentence
                        entity_text = annotation[0]

                        if entity_text not in entity_check:  # The entity was not added to sentence

                            # Update the entity offset in the sentence context
                            entity_begin_offset = sentence.text.find(
                                entity_text)

                            if entity_begin_offset > -1:
                                entity_count += 1
                                entity_id = sentence_id + ".e" + str(
                                    entity_count)
                                entity_final_offset = entity_begin_offset + len(
                                    entity_text) - 1
                                entity_offset = str(
                                    entity_begin_offset) + "-" + str(
                                        entity_final_offset)
                                entity_check.append(entity_text)
                                valid_entities_list.append(entity_id)

                                # Create xml structure for annotation
                                new_entity = ET.SubElement(
                                    new_sentence, "entity")
                                new_entity.set("id", entity_id)
                                new_entity.set("charOffset", entity_offset)
                                new_entity.set("type", "chebi")
                                new_entity.set("text", entity_text)
                                new_entity.set("ontology_id", annotation[1])

                # Create Xml structure for pairs of entities in sentence
                pair_count = 0
                pair_check = []

                for valid_entity in valid_entities_list:

                    for valid_entity_2 in valid_entities_list:
                        print(valid_entity)
                        if valid_entity != valid_entity_2:  # Create a pair between two different entities
                            pair_check_id1 = valid_entity + "_" + valid_entity_2
                            pair_check_id2 = valid_entity_2 + "_" + valid_entity

                            if pair_check_id1 not in pair_check and pair_check_id2 not in pair_check:  # Prevent duplicate pairs
                                pair_count += 1
                                pair_id = sentence_id + ".p" + str(pair_count)
                                pair_check.append(pair_check_id1)
                                pair_check.append(pair_check_id2)

                                new_pair = ET.SubElement(new_sentence, "pair")
                                new_pair.set("id", pair_id)
                                new_pair.set("e1", valid_entity), new_pair.set(
                                    "e2", valid_entity_2)
                                new_pair.set("ddi", "false")

            #Create an .xml output file
            ET.ElementTree(new_root).write("./bolstm/converted_chebi_craft/" +
                                           file_id + ".xml",
                                           xml_declaration=True)
Example no. 28
import spacy
from spacy.pipeline import Sentencizer
import pandas as pd

from preprocess import preprocess, construct_spacy_obj
import ft
import train
from feature_extraction import feature_extraction
from classifiation import classify

nlp = spacy.load('en_core_web_sm')
sentencizer = Sentencizer(punct_chars=[".", "!", "?", "\n", "\r", ";"])
nlp.add_pipe(sentencizer)

ft_model = ft.get_model()
model = train.get_model(nlp, ft_model)


def get_features_and_classification(filename):
    df = pd.read_csv("csv_files/" + filename,
                     header=None,
                     names=['reviewText', 'rating'])
    df = preprocess(df, nlp)
    df = construct_spacy_obj(df, nlp)

    features = feature_extraction(df, ft_model, nlp)
    result, _, __ = classify(df, features, model)

    return features, result
Example no. 29
    def __init__(self,
                 dataset=None,
                 entity_labels=None,
                 no_rel_label=None,
                 no_rel_multiple=False,
                 sentence_align=False,
                 test=False,
                 same_entity_relation=False,
                 write_Entites=False,
                 generalize=False,
                 parallelize=False,
                 no_of_cores=64,
                 predictions_folder=None,
                 de_sample=None):
        """
           Data files are read in and the sentence where the entitiy pair is located is segmented into 5
           along with the labels and the track information (file number, entity1 and entity 2) that helps to write predictions
           back to file.
           :param dataset: path to dataset
           :param predictions_folder: path to predictions (output) folder
           :param entity_labels: labels of the list of entities that create the relations
           :param no_labels: name the label when entities that do not have relations in a sentence are considered
           :param no_rel_multiple: flag whether multiple labels are possibles for No-relation
           :param sentence_align: options to break sentences
           :param test: flag to run test-segmentation options
           :param same_entity_relation: flag when relation exists between same type of entities
           :param de_sample: flag to reduce the no of samples
           :param generalize: flag when relations are not dependent on the first given relation label
           :param parallelize: flag to parallelize the segmentation
           :param no_of_cores: no of cores to run the parallelized segmentation
           :param write_Entites: write entities and predictions to file
           :param with_labels: Take the labels of the entites into consideration during segmentation

        """
        self.predictions_folder = predictions_folder
        self.dataset = dataset
        self.entity_labels = entity_labels
        self.test = test
        self.same_entity_relation = same_entity_relation
        self.generalize = generalize
        self.parallelize = parallelize
        self.write_Entites = write_Entites
        self.nlp_model = English()
        self.nlp_model.max_length = 2000000
        if no_rel_label:
            self.no_rel_label = no_rel_label
        else:
            self.no_rel_label = False
        self.no_rel_multiple = no_rel_multiple

        if de_sample:
            self.de_sample = de_sample
        else:
            self.de_sample = False

        if sentence_align:
            sentencizer = Sentencizer(punct_chars=["\n"])
        else:
            sentencizer = Sentencizer(punct_chars=["\n", ".", "?"])

        if self.write_Entites and self.predictions_folder is not None:
            ext = ".ann"
            file.delete_all_files(predictions_folder, ext)

        self.nlp_model.add_pipe(sentencizer)

        # self.nlp_model = spacy.load('en_core_web_sm')

        # global segmentation object that returns all segments and the label
        self.segments = {
            'seg_preceding': [],
            'seg_concept1': [],
            'seg_concept2': [],
            'seg_concept1_label': [],
            'seg_concept2_label': [],
            'seg_middle': [],
            'seg_succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

        #if parallelize flag is true
        if self.parallelize:
            # Pool object which offers a convenient means of parallelizing the execution of a function
            # across multiple input values, distributing the input data across processes
            pool = Pool(no_of_cores)
            all_args = []
            for datafile, txt_path, ann_path in self.dataset:
                all_args.append([datafile, txt_path, ann_path])
            segments_file = pool.map(self.process_file_parallel, all_args)
            pool.close()
            pool.join()

            # count = 0
            # for i in range(len(segments_file)):
            #     count  = count + len(segments_file[i]['label'])
            # print(count)

            for segment in segments_file:
                # Add lists of segments to the segments object for the dataset
                self.segments['seg_preceding'].extend(segment['preceding'])
                self.segments['seg_concept1'].extend(segment['concept1'])
                self.segments['seg_middle'].extend(segment['middle'])
                self.segments['seg_concept2'].extend(segment['concept2'])
                self.segments['seg_succeeding'].extend(segment['succeeding'])
                self.segments['sentence'].extend(segment['sentence'])
                self.segments['track'].extend(segment['track'])
                # if not self.test:
                self.segments['label'].extend(segment['label'])
                # self.segments['seg_concept1_label'].extend(segment['concept1_label'])
                # self.segments['seg_concept2_label'].extend(segment['concept2_label'])
        else:
            segment = self.process_file_serial(dataset)

            # Add lists of segments to the segments object for the dataset
            self.segments['seg_preceding'].extend(segment['preceding'])
            self.segments['seg_concept1'].extend(segment['concept1'])
            self.segments['seg_middle'].extend(segment['middle'])
            self.segments['seg_concept2'].extend(segment['concept2'])
            self.segments['seg_succeeding'].extend(segment['succeeding'])
            self.segments['sentence'].extend(segment['sentence'])
            self.segments['track'].extend(segment['track'])
            # if not self.test:
            self.segments['label'].extend(segment['label'])
            self.segments['seg_concept1_label'].extend(
                segment['concept1_label'])
            self.segments['seg_concept2_label'].extend(
                segment['concept2_label'])

        if not self.test:
            # print(set(self.segments['label']))
            # print the number of instances of each relation classes
            print([(i, self.segments['label'].count(i))
                   for i in set(self.segments['label'])])

        # write the segments to a file
        file.list_to_file('sentence_test', self.segments['sentence'])
        file.list_to_file('preceding_seg', self.segments['seg_preceding'])
        file.list_to_file('concept1_seg', self.segments['seg_concept1'])
        file.list_to_file('middle_seg', self.segments['seg_middle'])
        file.list_to_file('concept2_seg', self.segments['seg_concept2'])
        file.list_to_file('succeeding_seg', self.segments['seg_succeeding'])
        file.list_to_file('track_test', self.segments['track'])