def download():
    import os
    from pathlib import Path as path

    HOME_DIR = str(path.home())

    # check nltk_data availability, download if not available
    import nltk
    nltk_rsc = os.path.join(HOME_DIR, 'nltk_data')
    for required in [os.path.join('corpora', 'stopwords.zip'),
                     os.path.join('taggers', 'averaged_perceptron_tagger.zip')]:
        if not os.path.exists(os.path.join(nltk_rsc, required)):
            print('downloading nltk: ', required[:-4])
            nltk.download(os.path.basename(required)[:-4], quiet=True)

    # check stanza_data availability, download if not available
    import stanza
    stanza_rsc = os.path.join(HOME_DIR, 'stanza_resources/en/ner')
    for required in ['anatem.pt', 'bionlp13cg.pt', 'i2b2.pt', 'jnlpba.pt']:
        if not os.path.exists(os.path.join(stanza_rsc, required)):
            print('downloading stanza: ', required[:-3])
            stanza.download('en', package='craft',
                            processors={'ner': required[:-3]}, verbose=False)

    # check benepar_data availability, download if not available
    import benepar
    if not os.path.exists(os.path.join(nltk_rsc, 'models', 'benepar_en3')):
        print('downloading benepar: benepar_en3')
        benepar.download('benepar_en3')
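# A minimal usage sketch for download() above: since it only fetches
# resources missing from the home directory, calling it unconditionally at
# startup is a cheap no-op after the first run.
if __name__ == '__main__':
    download()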
def __init__(self, lang={'spacy': 'en', 'benepar': 'benepar_en2'}, config=None):
    super().__init__()
    self.download = False

    # Checking if NLTK sentence and word tokenizers should be downloaded
    if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
        spacy.load(lang['spacy'])
        config_global['config_benepar']['benepar_sent_word_tok_downloaded'] = True
        self.download = True

    # Checking if parsing model should be downloaded
    if not config_berkeley_nlp['parsing_model_downloaded']:
        benepar.download(lang['benepar'])
        config_global['config_benepar']['parsing_model_downloaded'] = True
        self.download = True

    # Updating yaml file if necessary
    if self.download:
        with open("./config.yaml", "w") as f:
            yaml.dump(config_global, f)

    self.nlp = spacy.load(lang['spacy'])
    self.nlp.add_pipe(BeneparComponent(lang['benepar']))
    self.sd = StanfordDependencies.get_instance(backend='subprocess')  # to convert trees
    self.name_save = 'benepar'
def _get_nlp(language="en", constituencies=False):
    """Get spaCy/benepar with models by language."""
    import spacy

    language = language.lower()
    model_name = LANGUAGE_TO_MODEL.get(language, language)
    try:
        nlp = spacy.load(model_name)
    except OSError:
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    if language in BENEPAR_LANGUAGES and constituencies:
        from benepar.spacy_plugin import BeneparComponent
        try:
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
        except LookupError:
            import benepar
            benepar.download(BENEPAR_LANGUAGES[language])
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
    # nlp.add_pipe(nlp.create_pipe("sentencizer"))
    return nlp
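# A hedged usage sketch for _get_nlp() above; the demo function name is
# illustrative, and it assumes the module's BENEPAR_LANGUAGES table covers
# "en". With the benepar component in the pipeline, constituency parses are
# exposed through the Span._ extension namespace:
def _demo_constituency_parse():
    nlp = _get_nlp("en", constituencies=True)
    doc = nlp("The quick brown fox jumps over the lazy dog.")
    for sent in doc.sents:
        print(sent._.parse_string)  # bracketed constituency parse of the sentence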
def main():
    input_file, output_file = sys.argv[1:3]
    benepar.download('benepar_en2')
    parser = benepar.Parser("benepar_en2")
    converted_examples = []
    with open(input_file, 'r') as f:
        input_examples = f.readlines()
    for line in tqdm(input_examples):
        converted_examples.append(convert_example(line, parser))
    with open(output_file, 'w') as g:
        g.write("\n".join(converted_examples))
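# Hedged usage note for main() above: the input and output paths come from
# argv, so the script would be invoked as, e.g. (script name illustrative):
#
#     python convert_examples.py input_examples.txt converted_examples.txt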
def preprocess(self, data):
    import nltk
    nltk.download('punkt')
    import benepar
    benepar.download('benepar_en2_large')
    parser = benepar.Parser("benepar_en2_large")

    from tqdm import tqdm
    progress_bar = tqdm(total=len(data))
    for d in data:
        d['parse_tree'] = parser.parse(d['description'])
        if 'reference_description' in d:
            d['ref_parse_tree'] = (parser.parse(d['reference_description'])
                                   if len(d['reference_description']) > 0
                                   else d['parse_tree'])
        progress_bar.update(1)
    progress_bar.close()
    return data
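# A hedged follow-up sketch: with the standalone benepar.Parser used in
# preprocess(), parse() returns an nltk.Tree, so the stored 'parse_tree'
# values can be inspected with the usual NLTK tree API (function name
# illustrative):
def _print_parse_trees(parsed_data):
    for d in parsed_data:
        print(d['parse_tree'].pformat(margin=80))  # pretty-printed bracketed parse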
def __init__(self, sentiment_package="vader", parse_package="benepar",
             model_dir="./workspace/models/er_model"):
    self.sentiment_package = sentiment_package
    self.nlp = spacy.load(model_dir)
    self.num_cores = multiprocessing.cpu_count()
    if parse_package == 'benepar':
        try:
            self.parser = benepar.Parser("benepar_en2")
        except LookupError:
            benepar.download('benepar_en2')
            self.parser = benepar.Parser("benepar_en2")
    elif parse_package == 'stanford':
        pass
    else:
        raise Exception('incorrect parse package')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
test.py

(C) 2020 by Damir Cavar <*****@*****.**>, Semiring Inc.

To run the server with the default port and host, just run this script in the
current folder. The default port is 5000, the default host is localhost.
"""

from flask import Flask
from japi import app

# install the benepar model; this is only needed the first time the user sets up the API
import benepar
benepar.download('benepar_en2')

app.debug = True

if __name__ == "__main__":
    app.run()
def download_models():
    benepar.download('benepar_en2')
    nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import spacy
import benepar
benepar.download('benepar_en2', quiet=True)
from benepar.spacy_plugin import BeneparComponent
from compoundBinary import preprocess
from determineInterrogative import sentenceNER

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(BeneparComponent('benepar_en2'))


# use parse tree to determine which word to replace with interrogative
# assume inputs are results from compound binary questions
# and/or simple binary questions
def generateWhenWhereFromQ(question, entity_dict):
    doc = nlp(question)
    sent = list(doc.sents)[0]
    children = list(sent._.children)
    puncts = '?!.,;:-'
    constituents = list(sent._.constituents)
    stringtotakeout = ''
    questionword = ''
_debugger(). This is unfortunately THE way to do it (at least for now), as
outlined in the documentation of benepar (https://pypi.org/project/benepar/):
"Since spaCy does not provide an official constituency parsing API, all
methods are accessible through the extension namespaces Span._ and Token._"

This file is Copyright (c) 2021 Yuzhi Tang, Hongshou Ge, Zheng Luan.
"""
from typing import Any

import benepar
import spacy

from GrammarTree import GrammarTree

# download and load parsing model
spacy.cli.download("en_core_web_md")
benepar.download('benepar_en3')
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})


def translate(text: str) -> list[GrammarTree]:
    """Create a list of GrammarTree objects (each GrammarTree object
    represents a sentence) based on the input text using the benepar library.

    Precondition:
        - text can only contain letters in the English alphabet and basic
          punctuation marks (e.g. ",", ".", "?", "!").
    """
    grammar_trees = []
    doc = nlp(text)
import nltk
import benepar

nltk.download('punkt')
benepar.download('benepar_zh')
import nltk
import scipy
import spacy
import benepar
from string import punctuation
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")  # we'll use GPT2 to generate sentences

# load BERT model
model_BERT = SentenceTransformer('bert-base-nli-mean-tokens')  # we'll use BERT to filter sentences based on similarity

nltk.download("punkt")
nlp = spacy.load("en")
benepar.download("benepar_en2")
benepar_parser = benepar.Parser("benepar_en2")

# load summarizer model
# model = Summarizer()


def clean_text(text):
    """Wrapper function to perform any text cleaning that we'd want to do."""
    text = text.strip(punctuation)
    return text
def main():
    input_conll_file = sys.argv[1]
    benepar.download('benepar_en2')
    parser = benepar.Parser("benepar_en2")
    add_predconst(input_conll_file, parser)
def get_true_false_questions(text, num_questions):
    """
    Get true/false questions for the specified text

    Args:
        • text: text for which to create questions
        • num_questions: number of questions to create

    Output:
        • question_answers_list: list of questions, where each entry is the
          question + answers for that question
    """
    # load GPT2 (for generating false sequences) and BERT (for finding the
    # sentence similarity of our real sentence against our fake sentence)
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")  # we'll use GPT2 to generate sentences

    # load BERT model
    model_BERT = SentenceTransformer('bert-base-nli-mean-tokens')  # we'll use BERT to filter sentences based on similarity

    # load necessary NLP tools + parser
    nltk.download("punkt")
    nlp = spacy.load("en")
    benepar.download("benepar_en2")
    benepar_parser = benepar.Parser("benepar_en2")

    # clean + split text
    text = clean_text(text)
    cleaned_text = get_sentences(text)
    cleaned_text = [clean_text(x) for x in cleaned_text]

    # use parser to split sentences, remove last verb phrase or last noun phrase
    sentence_completion_dict = get_sentence_completions(cleaned_text)

    # get false sentences
    probability_true = 0.5  # probability that we'll add a True statement, rather than the False statement
    num_fake_sentences = 3  # number of (maximum) fake sentences that we'd like to create for each real partial sentence
    answer_choices = " (a) True (b) False"  # define our answer choices
    question_answers_list = []  # list to hold our questions and answers

    for key_sentence in sentence_completion_dict:
        # get our partial sentences
        partial_sentences_list = sentence_completion_dict[key_sentence]

        # start creating false sentences
        false_sentences = []
        print(f"The number of false sentences that we have for the keyword of ({key_sentence}) is: {len(partial_sentences_list)}")

        # loop through list of partial sentences
        for sentence in partial_sentences_list:
            # create our false sentences
            false_sents = generate_sentences(sentence, key_sentence, num_fake_sentences)
            false_sentences.extend(false_sents)
        print(f"After the for loop through the partial sentences, we have {len(false_sentences)} false sentences")

        for idx, false_sent in enumerate(false_sentences):
            # for each fake option, we now need to decide if we'll use a fake
            # question or a real question
            if np.random.uniform() <= probability_true:
                # return the actual question
                question = f" (ANSWER: True) {key_sentence} : " + answer_choices + "\n"  # e.g., "(Answer: True) : 2 + 2 = 4"
            else:
                # return the false sentence
                question = f" (ANSWER: False) {false_sent} : " + answer_choices + "\n"  # e.g., "(Answer: False) : 2 + 2 = 5"

            # add question to question list
            question_answers_list.append(question)

        print(f"We have {len(question_answers_list)} questions in our list")
        if len(question_answers_list) >= num_questions:
            break

    # shuffle our questions
    random.shuffle(question_answers_list)

    # get the first "num_questions" values
    return question_answers_list[:num_questions]
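# Hedged usage sketch for get_true_false_questions() above; the helpers
# clean_text, get_sentences, get_sentence_completions, and generate_sentences
# are assumed to be defined elsewhere in this module:
#
#     questions = get_true_false_questions(article_text, num_questions=5)
#     for question in questions:
#         print(question)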
def download_benepar_model(name: str = "benepar_fr") -> None:
    """Download Benepar model `name`."""
    benepar.download(name)
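# Hedged usage sketch; model names depend on the installed benepar release
# ("benepar_fr" ships with benepar 0.1.x, while newer releases use
# "benepar_fr2"):
download_benepar_model()               # fetch the default French model
download_benepar_model("benepar_en3")  # or any other released model name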
import re
import statistics

import benepar
import nltk
import pandas as pd
import spacy
from scipy.stats import sem

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
benepar.download("benepar_en3")

spacy_parser = spacy.load("en_core_web_sm")
spacy_parser.add_pipe("benepar", config={"model": "benepar_en3"})

# load datasets from csv files
wsb = pd.read_csv(r"wsb.csv")
ssb = pd.read_csv(r"ssb.csv")
crypto = pd.read_csv(r"crypto.csv")
stocks = pd.read_csv(r"stocks.csv")
ds = pd.concat([wsb, stocks, crypto, ssb], axis=0)


def remove_emojis(sentence):
    pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols