from copy import deepcopy

from termcolor import colored
import en_coref_md


def mark_sentence(sen, highlight_index):
    for word, index in zip(sen, range(len(sen))):
        if index in highlight_index:
            print(" ", colored(word, 'red'), end="")
        elif is_punctuation(word):
            print(word, end="")
        else:
            print(" ", word, end="")
    print("\n")
    return


# lsls: a list of lists of words representation
# raw_text: a single string representation of the document
# return: a single string after resolution
def resolve(lsls, raw_text):
    lsls2 = deepcopy(lsls)
    sentence_pos = label_positions(lsls2)
    mod = nlp(raw_text)
    resolution(mod, sentence_pos, raw_text, lsls2)
    resolved = merge_doc(lsls2)
    return resolved


## MACRO
nlp = en_coref_md.load()
DEBUG = 0
PRINT = 0  ## will print before and after resolution text for checking

## For testing
# CNN = resolve_all(fullTextSentences[:2])
# Note: takes ~30min for 1000 documents
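## A hedged usage sketch (the sentences below are made up; label_positions,
## resolution and merge_doc are assumed to be defined elsewhere in this module):
# sample_sentences = [["Alice", "went", "home", "."], ["She", "was", "tired", "."]]
# sample_text = "Alice went home. She was tired."
# print(resolve(sample_sentences, sample_text))  # e.g. "Alice went home. Alice was tired."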
def __init__(self):
    # load the NeuralCoref model, which is built on top of the spaCy English model
    self.nlp = en_coref_md.load()
    # load our modified tokenizer for better tokenization of biomedical text
    self.nlp.tokenizer = text_utils.biomedical_tokenizer(self.nlp)
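# Usage sketch (hedged): `Preprocessor` below is a placeholder name for whatever
# class this __init__ belongs to. The loaded en_coref_md model registers
# NeuralCoref's standard extension attributes on every parsed Doc.
#
#   preprocessor = Preprocessor()
#   doc = preprocessor.nlp("The protein binds ATP. It then changes conformation.")
#   if doc._.has_coref:
#       print(doc._.coref_resolved)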
import time
from contextlib import contextmanager

import en_coref_md


# nice way to report running times
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')


# it's better to load the Spacy nlp model rarely:
# it contains vocabulary, syntax and word embeddings for the English language.
# You first need to install the model (pip install MODEL_URL)
# https://github.com/huggingface/neuralcoref
# then you need to install specific versions of cymem and spacy (see requirements.txt)
with timer('Loading Spacy model'):
    SPACY_MODEL = en_coref_md.load()


def build_features(df, spacy_model=SPACY_MODEL):
    """
    :param df: pandas DataFrame with competition data
    :param spacy_model: loaded Spacy model
    :return: pandas DataFrame with features
    """
    # first we add precomputed spacy docs for each sentence as a new Series
    df['spacy_nlp_doc'] = [
        spacy_model(df['Text'].iloc[i]) for i in range(len(df))
    ]
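# A minimal usage sketch, assuming the model is installed; the column name 'Text'
# comes from build_features() above, and the demo sentences are made up.
if __name__ == '__main__':
    import pandas as pd

    demo_df = pd.DataFrame({'Text': [
        'Anna told her sister that she would be late.',
        'The committee met today. It approved the budget.',
    ]})
    with timer('Building features'):
        build_features(demo_df)
    print(demo_df['spacy_nlp_doc'].iloc[0])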
def nlp(): """Returns an instance of a spaCy's nlp object after replacing the default tokenizer with our modified one.""" custom_nlp = en_coref_md.load() custom_nlp.tokenizer = text_utils.biomedical_tokenizer(custom_nlp) return custom_nlp
def result():
    if request.method == 'POST':
        result = request.form
        x = result['X']
        y = result['Y']

        # look up the abbreviations for both terms in the XML file
        tree = ET.parse("abbreviation.xml")
        root = tree.getroot()
        abbx = 0
        abby = 0
        for app in root.findall('parts'):
            for l in app.findall('part'):
                if l.attrib['name'] == x:
                    abbx = l.attrib['abbreviation']
                    print(abbx)
                if l.attrib['name'] == y:
                    abby = l.attrib['abbreviation']
                    print(abby)

        with open('bams.csv', 'rt') as f:
            data = csv.reader(f)
            for row in data:
                if row[0] == str(abbx) and row[1] == str(abby):
                    score = row[2]
                    print("BAMS score of the terms:", row[2])

        # full-text search for articles mentioning each term
        collection.create_index([('entry.summary', 'text')])
        list_x = collection.find({"$text": {"$search": x}})
        list_y = collection.find({"$text": {"$search": y}})
        # list_final = dict(list_x.items() & list_y.items())
        """for item in list_x.keys():
            if list_y.has_key(item):
                list_final.append(item)"""
        print(list_x.count())
        print(list_y.count())
        if list_x.count() > list_y.count():
            list_final = [value for value in list_y if value in list_x]
        else:
            list_final = [value for value in list_x if value in list_y]
        print(list_final)

        nlp = en_coref_md.load()

        # split the text of the articles into sentences
        sentences = []
        for elem in list_final:
            doc = nlp(elem['entry']['summary'])
            sentences.append(sent_tokenize(elem['entry']['summary']))

        # flatten the list
        sentences = [y for x in sentences for y in x]

        # remove punctuation, numbers and special characters
        clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

        # make everything lowercase
        clean_sentences = [s.lower() for s in clean_sentences]

        doc._.has_coref
        doc._.coref_clusters
        doc._.coref_resolved

        nltk.download('stopwords')
        stop_words = stopwords.words('english')

        # function to remove stopwords
        def remove_stopwords(sen):
            sen_new = " ".join([i for i in sen if i not in stop_words])
            return sen_new

        # remove stopwords from the sentences
        clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

        # load the pre-trained GloVe word embeddings
        word_embeddings = {}
        f = open('glove.6B.100d.txt', encoding='utf-8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = coefs
        f.close()

        # build each sentence vector as the average of its word vectors
        sentence_vectors = []
        for i in clean_sentences:
            if len(i) != 0:
                v = sum([word_embeddings.get(w, np.zeros((100,)))
                         for w in i.split()]) / (len(i.split()) + 0.001)
            else:
                v = np.zeros((100,))
            sentence_vectors.append(v)
        len(sentence_vectors)

        # pairwise cosine similarity between sentence vectors
        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(
                        sentence_vectors[i].reshape(1, 100),
                        sentence_vectors[j].reshape(1, 100))[0, 0]

        # rank sentences with PageRank on the similarity graph (TextRank)
        nx_graph = nx.from_numpy_array(sim_mat)
        scores = nx.pagerank(nx_graph)
        ranked_sentences = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        sn = 1
        for i in range(sn):
            print(ranked_sentences[i][1])

        model = Word2Vec.load("word2vec.model")
        sim = model.similarity(x, y)
        print(sim)
        print(result)

        return render_template("final.html",
                               sim=sim,
                               list_final=list_final,
                               ranked_sentences=ranked_sentences[0][1],
                               x=x,
                               y=y,
                               score=score)
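# Hedged illustration of the NeuralCoref attributes touched above (example text
# and outputs follow the library's documentation, not this application's data):
#
#   nlp = en_coref_md.load()
#   doc = nlp("My sister has a dog. She loves him.")
#   doc._.has_coref        # True
#   doc._.coref_clusters   # [My sister: [My sister, She], a dog: [a dog, him]]
#   doc._.coref_resolved   # "My sister has a dog. My sister loves a dog."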
def get_corefnlp():
    nlp = en_coref_md.load()
    return nlp
import os

import nltk
import en_coref_md
from openie_wrapper import openie_ie
from stanfordnlp.server import CoreNLPClient
from config import config

# chunk grammar for extracting noun phrases
patterns = """
    NP: {<DT><WP><VBP>*<RB>*<VBN><IN><NN>}
        {<NN|NNS|NNP|NNPS><IN>*<NN|NNS|NNP|NNPS>+}
        {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS><CC>*<NN|NNS|NNP|NNPS>+}
        {<JJ|JJR|JJS>*<NN|NNS|NNP|NNPS>+}
"""

print("Getting NP parser..")
NPChunker = nltk.RegexpParser(patterns)

print("Getting coref parser..")
coref_parser = en_coref_md.load()

os.environ["CORENLP_HOME"] = config["CORENLP_HOME"]


def convert_to_tree(text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    sentences = [NPChunker.parse(sent) for sent in sentences]
    return sentences


def get_noun_phrases(text):
    trees = convert_to_tree(text)
    nps = []
    for tree in trees:
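# A comment-only usage sketch (get_noun_phrases() above is shown truncated; the
# sentence is made up): convert_to_tree() tags and chunks each sentence, so NP
# subtrees matching the grammar can be read off the returned trees.
#
#   trees = convert_to_tree("The hippocampus of the rat projects to the entorhinal cortex.")
#   for tree in trees:
#       for subtree in tree.subtrees(filter=lambda t: t.label() == "NP"):
#           print(" ".join(word for word, tag in subtree.leaves()))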
def nlp(): """Returns Sacy NLP model.""" return en_coref_md.load()