Code Example #1
from copy import deepcopy

import en_coref_md
from termcolor import colored

# is_punctuation, label_positions, resolution and merge_doc are helpers defined
# elsewhere in the original project.


def mark_sentence(sen, highlight_index):
    """Print a sentence, highlighting the words at the given indices in red."""
    for index, word in enumerate(sen):
        if index in highlight_index:
            print(" ", colored(word, 'red'), end="")
        elif is_punctuation(word):
            print(word, end="")
        else:
            print(" ", word, end="")
    print("\n")


# lsls: a list-of-lists-of-words representation of the document
# raw_text: a single-string representation of the document
# return: a single string after coreference resolution
def resolve(lsls, raw_text):
    lsls2 = deepcopy(lsls)
    sentence_pos = label_positions(lsls2)
    mod = nlp(raw_text)
    resolution(mod, sentence_pos, raw_text, lsls2)
    resolved = merge_doc(lsls2)
    return resolved

## MACRO
nlp = en_coref_md.load()
DEBUG = 0
PRINT = 0  ## if set to 1, print the text before and after resolution for checking

## For testing
# CNN = resolve_all(fullTextSentences[:2])
# Note: takes ~30min for 1000 documents
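
A minimal usage sketch for resolve(), assuming a tiny hypothetical two-sentence document; the sample text and its tokenization below are illustrative only.

# Hypothetical sample document, as raw text and as a list of lists of words.
raw_text = "Alice met Bob. She greeted him."
lsls = [["Alice", "met", "Bob", "."], ["She", "greeted", "him", "."]]

# resolve() should return the text with pronouns replaced by their referents,
# e.g. roughly "Alice met Bob. Alice greeted Bob."
print(resolve(lsls, raw_text))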
Code Example #2
def __init__(self):
    # load the NeuralCoref model, which is built on top of the spaCy English model
    self.nlp = en_coref_md.load()
    # load our modified tokenizer, which gives better tokenization of biomedical text
    self.nlp.tokenizer = text_utils.biomedical_tokenizer(self.nlp)
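
A minimal sketch of how this initializer might be used. The wrapping class name Preprocessor and its annotate method are hypothetical; en_coref_md and text_utils.biomedical_tokenizer are taken from the snippet above.

import en_coref_md
import text_utils


class Preprocessor:  # hypothetical name for the class that owns the __init__ above
    def __init__(self):
        self.nlp = en_coref_md.load()
        self.nlp.tokenizer = text_utils.biomedical_tokenizer(self.nlp)

    def annotate(self, text):
        # hypothetical convenience method: run the customized pipeline on raw text
        return self.nlp(text)


doc = Preprocessor().annotate("IL-2 binds to the IL-2 receptor.")
print([token.text for token in doc])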
Code Example #3
# nice way to report running times
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


# it's better to load the spaCy nlp model only once: it contains the vocabulary,
# syntax and word embeddings for the English language
# you first need to install the model (pip install MODEL_URL)
# https://github.com/huggingface/neuralcoref
# then you need to install specific versions of cymem and spacy (see requirements.txt)
with timer('Loading Spacy model'):
    SPACY_MODEL = en_coref_md.load()


def build_features(df, spacy_model=SPACY_MODEL):
    """

    :param df: pandas DataFrame with competition data
    :param spacy_model: loaded Spacy model
    :return: pandas DataFrame with features
    """

    # first we add precomputed spacy docs for each sentence as a new Series
    df['spacy_nlp_doc'] = [
        spacy_model(df['Text'].iloc[i]) for i in range(len(df))
    ]
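    # Hedged sketch of a possible continuation (not part of the original snippet):
    # the column names below are illustrative, while doc._.has_coref and
    # doc._.coref_clusters are the extension attributes added by the en_coref_md model.
    df['has_coref'] = [doc._.has_coref for doc in df['spacy_nlp_doc']]
    df['n_coref_clusters'] = [
        len(doc._.coref_clusters) if doc._.has_coref else 0
        for doc in df['spacy_nlp_doc']
    ]
    return df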
Code Example #4
File: test_text_utils.py    Project: villiedie/saber
def nlp():
    """Returns an instance of a spaCy's nlp object after replacing the default tokenizer with
    our modified one."""
    custom_nlp = en_coref_md.load()
    custom_nlp.tokenizer = text_utils.biomedical_tokenizer(custom_nlp)
    return custom_nlp
Code Example #5
def result():
    if request.method == 'POST':
        result = request.form
        x = result['X']
        y = result['Y']

        tree = ET.parse("abbreviation.xml")
        root = tree.getroot()

        # look up the abbreviations for the two part names in the XML file
        abbx = 0
        abby = 0
        for app in root.findall('parts'):
            for part in app.findall('part'):
                if part.attrib['name'] == x:
                    abbx = part.attrib['abbreviation']
                    print(abbx)
                if part.attrib['name'] == y:
                    abby = part.attrib['abbreviation']
                    print(abby)
        # look up the BAMS score for the (abbx, abby) pair in the CSV file
        score = 0
        with open('bams.csv', 'rt') as f:
            data = csv.reader(f)
            for row in data:
                if row[0] == str(abbx) and row[1] == str(abby):
                    score = row[2]
                    print(f"BAMS score of the TERMS: {row[2]}")

        collection.create_index([('entry.summary', 'text')])
        list_x = collection.find({"$text": {"$search": x}})
        list_y = collection.find({"$text": {"$search": y}})
        #list_final = dict(list_x.items() & list_y.items())
        """for item in list_x.keys( ):
			if list_y.has_key(item):
				list_final.append(item)"""

        # materialize the cursors so they can be counted and iterated more than once
        docs_x = list(list_x)
        docs_y = list(list_y)
        print(len(docs_x))
        print(len(docs_y))
        # keep only the documents returned by both searches
        if len(docs_x) > len(docs_y):
            list_final = [value for value in docs_y if value in docs_x]
        else:
            list_final = [value for value in docs_x if value in docs_y]

        print(list_final)

        nlp = en_coref_md.load()
        # split the text in the articles into sentences
        sentences = []
        for elem in list_final:
            doc = nlp(elem['entry']['summary'])
            sentences.append(sent_tokenize(elem['entry']['summary']))

        # flatten the list
        sentences = [y for x in sentences for y in x]
        # remove punctuation, numbers and special characters
        clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
        # make all letters lowercase
        clean_sentences = [s.lower() for s in clean_sentences]
        # neuralcoref extension attributes of the last parsed doc
        # (accessed here, but their results are not used further)
        doc._.has_coref
        doc._.coref_clusters
        doc._.coref_resolved
        nltk.download('stopwords')
        stop_words = stopwords.words('english')

        # function to remove stopwords
        def remove_stopwords(sen):
            sen_new = " ".join([i for i in sen if i not in stop_words])
            return sen_new

        # remove stopwords from the sentences
        clean_sentences = [
            remove_stopwords(r.split()) for r in clean_sentences
        ]

        word_embeddings = {}
        # load pre-trained GloVe word vectors (100-dimensional)
        word_embeddings = {}
        with open('glove.6B.100d.txt', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                word_embeddings[word] = coefs

        sentence_vectors = []
        for i in clean_sentences:
            if len(i) != 0:
                v = sum([
                    word_embeddings.get(w, np.zeros((100, )))
                    for w in i.split()
                ]) / (len(i.split()) + 0.001)
            else:
                v = np.zeros((100, ))
            sentence_vectors.append(v)

        len(sentence_vectors)

        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sentence_vectors[i].reshape(1, 100),
                        sentence_vectors[j].reshape(1, 100))[0, 0]

        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        sn = 1
        for i in range(sn):
            print(ranked_sentences[i][1])

        # compare the two search terms with a pre-trained Word2Vec model
        model = Word2Vec.load("word2vec.model")
        sim = model.wv.similarity(x, y)
        print(sim)
        print(result)

    return render_template("final.html",
                           sim=sim,
                           list_final=list_final,
                           ranked_sentences=ranked_sentences[0][1],
                           x=x,
                           y=y,
                           score=score)
Code Example #6
File: __init__.py    Project: indigos33k3r/LILACS
def get_corefnlp():
    nlp = en_coref_md.load()
    return nlp
Code Example #7
import os

import nltk
import en_coref_md
from openie_wrapper import openie_ie
from stanfordnlp.server import CoreNLPClient
from config import config

patterns = """
    NP:    {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
           {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
           {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
           {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS>+}
           
    """
print("Getting NP parser..")
NPChunker = nltk.RegexpParser(patterns)
print("Getting coref parser..")
coref_parser = en_coref_md.load()
os.environ["CORENLP_HOME"] = config["CORENLP_HOME"]


def convert_to_tree(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences


def get_noun_phrases(text):
    trees = convert_to_tree(text)
    nps = []
    for tree in trees:
        # assumed continuation (the original snippet ends here): collect NP chunk leaves
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            nps.append(" ".join(word for word, tag in subtree.leaves()))
    return nps
Code Example #8
File: test_preprocessor.py    Project: villiedie/saber
def nlp():
    """Returns Sacy NLP model."""
    return en_coref_md.load()