Example #1
def download():
    import os
    from pathlib import Path as path
    HOME_DIR = str(path.home())

    # check nltk_data availability, download if not available
    import nltk
    nltk_rsc = os.path.join(HOME_DIR, 'nltk_data')
    for required in [os.path.join('corpora', 'stopwords.zip'), os.path.join('taggers', 'averaged_perceptron_tagger.zip')]:
        if not os.path.exists(os.path.join(nltk_rsc, required)):
            print('downloading nltk: ', required[:-4])
            nltk.download(os.path.basename(required)[:-4], quiet=True)

    # check stanza_data availability, download if not available
    import stanza
    stanza_rsc = os.path.join(HOME_DIR, 'stanza_resources/en/ner')
    for required in ['anatem.pt', 'bionlp13cg.pt', 'i2b2.pt', 'jnlpba.pt']:
        if not os.path.exists(os.path.join(stanza_rsc, required)):
            print('downloading stanza: ', required[:-3])
            stanza.download('en', package='craft', processors={'ner': required[:-3]}, verbose=False)

    # check benepar_data availability, download if not available
    import benepar
    if not os.path.exists(os.path.join(nltk_rsc, 'models', 'benepar_en3')):
        print('downloading benepar: benepar_en3')
        benepar.download('benepar_en3')
    def __init__(self,
                 lang={
                     'spacy': 'en',
                     'benepar': 'benepar_en2'
                 },
                 config=None):
        super().__init__()
        self.download = False
        # Checking if NLTK sentence and word tokenizers should be downloaded
        if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
            spacy.load(lang['spacy'])
            config_global['config_benepar'][
                'benepar_sent_word_tok_downloaded'] = True
            self.download = True
        # Checking if parsing model should be downloaded
        if not config_berkeley_nlp['parsing_model_downloaded']:
            benepar.download(lang['benepar'])
            config_global['config_benepar']['parsing_model_downloaded'] = True
            self.download = True
        # Updating yaml file if necessary
        if self.download:
            with open("./config.yaml", "w") as f:
                yaml.dump(config_global, f)

        self.nlp = spacy.load(lang['spacy'])
        self.nlp.add_pipe(BeneparComponent(lang['benepar']))
        self.sd = StanfordDependencies.get_instance(
            backend='subprocess')  # to convert trees
        self.name_save = 'benepar'
Example #3
def _get_nlp(language="en", constituencies=False):
    """
    Get spaCY/benepar with models by language
    """
    import spacy

    language = language.lower()
    model_name = LANGUAGE_TO_MODEL.get(language, language)

    try:
        nlp = spacy.load(model_name)
    except OSError:
        from spacy.cli import download

        download(model_name)
        nlp = spacy.load(model_name)

    if language in BENEPAR_LANGUAGES and constituencies:
        from benepar.spacy_plugin import BeneparComponent

        try:
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
        except LookupError:
            import benepar

            benepar.download(BENEPAR_LANGUAGES[language])
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
            # nlp.add_pipe(nlp.create_pipe("sentencizer"))
    return nlp
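# A minimal usage sketch for the helper above (an illustration, not part of the
# original example); it assumes LANGUAGE_TO_MODEL and BENEPAR_LANGUAGES are
# module-level dicts such as {"en": "en_core_web_sm"} and {"en": "benepar_en2"}.
if __name__ == "__main__":
    nlp = _get_nlp("en", constituencies=True)
    doc = nlp("The quick brown fox jumps over the lazy dog.")
    sent = list(doc.sents)[0]
    print(sent._.parse_string)  # bracketed constituency parse added by benepar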
Example #4
def main():
    input_file, output_file = sys.argv[1:3]
    benepar.download('benepar_en2')
    parser = benepar.Parser("benepar_en2")

    converted_examples = []
    with open(input_file, 'r') as f:
        input_examples = f.readlines()
        for line in tqdm(input_examples):
            converted_examples.append(convert_example(line, parser))
    with open(output_file, 'w') as g:
        g.write("\n".join(converted_examples))
Example #5
    def preprocess(self, data):
        import nltk
        nltk.download('punkt')
        import benepar
        benepar.download('benepar_en2_large')
        parser = benepar.Parser("benepar_en2_large")
        from tqdm import tqdm
        progress_bar = tqdm(total=len(data))
        for d in data:
            d['parse_tree'] = parser.parse(d['description'])
            if 'reference_description' in d:
                d['ref_parse_tree'] = parser.parse(
                    d['reference_description']) if len(
                        d['reference_description']) > 0 else d['parse_tree']
            progress_bar.update(1)
        progress_bar.close()
        return data
    def __init__(self,
                 sentiment_package="vader",
                 parse_package="benepar",
                 model_dir="./workspace/models/er_model"):

        self.sentiment_package = sentiment_package
        self.nlp = spacy.load(model_dir)
        self.num_cores = multiprocessing.cpu_count()

        if parse_package == 'benepar':
            try:
                self.parser = benepar.Parser("benepar_en2")
            except LookupError:
                benepar.download('benepar_en2')
                self.parser = benepar.Parser("benepar_en2")
        elif parse_package == 'stanford':
            pass
        else:
            raise Exception('incorrect parse package')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
test.py
(C) 2020 by Damir Cavar <*****@*****.**>, Semiring Inc.

To run the server with the default port and host, just run this script in the current folder.

The default port is 5000, the default host is localhost.
"""

from flask import Flask
from japi import app

## Install the benepar model. This is only needed the first time the user sets up the API
## (a guarded variant is sketched at the end of this snippet).
import benepar
benepar.download('benepar_en2')

app.debug = True

if __name__ == "__main__":
    app.run()
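# A guarded variant of the one-time download above (a sketch, not part of the
# original test.py): benepar stores its models under nltk_data/models/, as the
# benepar_en3 check in Example #1 shows, so the download can be skipped once the
# model directory already exists and restarts of the API do not re-fetch it.
import os

_benepar_model_dir = os.path.join(os.path.expanduser("~"), "nltk_data", "models", "benepar_en2")
if not os.path.isdir(_benepar_model_dir):
    benepar.download('benepar_en2')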
def download_models():
    benepar.download('benepar_en2')
    nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import spacy
import benepar
benepar.download('benepar_en2', quiet=True)

from benepar.spacy_plugin import BeneparComponent
from compoundBinary import preprocess
from determineInterrogative import sentenceNER
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(BeneparComponent('benepar_en2'))


# use the parse tree to determine which word to replace with an interrogative
# (an illustrative constituent-search sketch follows this snippet)

# assume inputs are results from compound binary questions
# and/or simple binary questions

def generateWhenWhereFromQ(question, entity_dict):
    doc = nlp(question)

    sent = list(doc.sents)[0]
    children = list(sent._.children)
    puncts = '?!.,;:-'
    constituents = list(sent._.constituents)
    stringtotakeout = ''
    questionword = '' 
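
# Illustrative sketch only, not the continuation of the truncated function above:
# one way the constituents exposed by benepar could be searched for the span to
# replace with an interrogative. The label names (PP, NP) are Penn Treebank labels
# used by benepar_en2; the entity-type checks against entity_dict are assumptions.
def find_when_where_span(sent, entity_dict):
    for constituent in sent._.constituents:
        if "PP" not in constituent._.labels:
            continue
        head_entity = entity_dict.get(constituent[-1].text)
        if head_entity in ("DATE", "TIME"):
            return constituent, "When"   # temporal phrase -> "when" question
        if head_entity in ("GPE", "LOC", "FAC"):
            return constituent, "Where"  # locational phrase -> "where" question
    return None, None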
Example #10
_debugger(). This is unfortunately THE way to do it (at least for now), as outlined
in the documentation of benepar (https://pypi.org/project/benepar/):

"Since spaCy does not provide an official constituency parsing API, all methods are
accessible through the extension namespaces Span._ and Token._"

This file is Copyright (c) 2021 Yuzhi Tang, Hongshou Ge, Zheng Luan.
"""
from typing import Any
import benepar
import spacy
from GrammarTree import GrammarTree

# download and load parsing model
spacy.cli.download("en_core_web_md")
benepar.download('benepar_en3')
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model": "benepar_en3"})


def translate(text: str) -> list[GrammarTree]:
    """Create a list of GrammarTree objects (each GrammarTree represents a sentence)
    based on the input text, using the benepar library.

    Precondition:
        - text can only contain letters in the English alphabet and basic
        punctuation marks (e.g. ",", ".", "?", "!").
    """
    grammar_trees = []

    doc = nlp(text)
Example #11
import nltk
import benepar

nltk.download('punkt')
benepar.download('benepar_zh')
import scipy

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(
    "gpt2")  # we'll use GPT2 to generate sentences
# load BERT model
model_BERT = SentenceTransformer(
    'bert-base-nli-mean-tokens'
)  # we'll use BERT to filter sentences based on similarity
nltk.download("punkt")
nlp = spacy.load("en")
#nltk.load("punkt")
benepar.download("benepar_en2")
benepar_parser = benepar.Parser("benepar_en2")

# load summarizer model
#model = Summarizer()


def clean_text(text):
    """
        Wrapper function to perform any text cleaning 
        that we'd want to do
    """
    text = text.strip(punctuation)
    return text

def main():
    input_conll_file = sys.argv[1]
    benepar.download('benepar_en2')
    parser = benepar.Parser("benepar_en2")
    add_predconst(input_conll_file, parser)
def get_true_false_questions(text, num_questions):

	"""

		Get true/false questions for the specified text
		Args:
			• text: text for which to create questions
			• num_questions: number of questions to create

		Output:
			• question_answers_list: list of questions, where
			each entry is the question + answers for that question

	"""

	# load GPT2 (for generating false sequences) and BERT (for finding sentence similarity of our real sentence
	# against our fake sentence)
	tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
	model = GPT2LMHeadModel.from_pretrained("gpt2") # we'll use GPT2 to generate sentences
	# load BERT model
	model_BERT = SentenceTransformer('bert-base-nli-mean-tokens') # we'll use BERT to filter sentences based on similarity

	# load necessary NLP tools + parser
	nltk.download("punkt")
	nlp = spacy.load("en")
	benepar.download("benepar_en2")
	benepar_parser = benepar.Parser("benepar_en2")

	# clean + split text
	text = clean_text(text)
	cleaned_text = get_sentences(text)
	cleaned_text = [clean_text(x) for x in cleaned_text]

	# use parser to split sentences, remove last verb phrase or last noun phrase
	sentence_completion_dict = get_sentence_completions(cleaned_text)

	# get false sentences
	probability_true = 0.5 # probability that we'll add a True statement, rather than the False statement
	num_fake_sentences = 3 # number of (maximum) fake sentences that we'd like to create for each real partial sentence
	answer_choices = " (a) True  (b) False" # define our answer choices
	question_answers_list = [] # list to hold our questions and answers

	for key_sentence in sentence_completion_dict:

		# get our partial sentence
		partial_sentences_list = sentence_completion_dict[key_sentence]

		# start creating false sentences
		false_sentences = []

		print(f"The number of false sentences that we have for the keyword of ({key_sentence}) is: {len(partial_sentences_list)}")
    
    	# loop through list of partial sentences
		for sentence in partial_sentences_list:

			# create our false sentences
			false_sents = generate_sentences(sentence, key_sentence, num_fake_sentences)
			false_sentences.extend(false_sents)

		print(f"After the for loop through the partial sentences, we have {len(false_sentences)} false sentences")

		for idx, false_sent in enumerate(false_sentences):

			# for each fake option, we now need to decide if we'll use a fake question or a real question

			# return the actual question
			if np.random.uniform() <= probability_true:
				question = f" (ANSWER: True) {key_sentence} : " + answer_choices + "\n" # e.g., "(Answer: True) : 2 + 2 = 4"
			# return the false sentence
			else:
				question = f" (ANSWER: False) {false_sent} : " + answer_choices + "\n" # e.g., "(Answer: False) : 2 + 2 = 5"

			# add question to question list
			question_answers_list.append(question)

			print(f"We have {len(question_answers_list)} questions in our list")

			if len(question_answers_list) >= num_questions:
				break

	# shuffle our questions
	random.shuffle(question_answers_list)

	# get the first "num_questions" values
	return question_answers_list[:num_questions]
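# get_sentence_completions is not shown in this listing; the sketch below is a
# hypothetical stand-in for the step described inside get_true_false_questions
# ("remove last verb phrase or last noun phrase"), written against the nltk.Tree
# objects that benepar.Parser returns. It is an assumption, not the original helper.
def _strip_last_phrase(sentence, parser):
    tree = parser.parse(sentence)          # nltk.Tree produced by benepar.Parser
    leaves = tree.leaves()
    cut = None
    for subtree in tree.subtrees(lambda t: t.label() in ("VP", "NP")):
        phrase = subtree.leaves()
        # only phrases that end the sentence are candidates for removal
        if leaves[-len(phrase):] == phrase:
            cut = len(leaves) - len(phrase)  # keep the most deeply nested trailing match
    if cut in (None, 0):
        return sentence
    return " ".join(leaves[:cut])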
def download_benepar_model(name: str = "benepar_fr") -> None:
    """Download Benepar model `name`."""
    benepar.download(name)
import benepar
import nltk
import pandas as pd
import re
import statistics
import spacy
from scipy.stats import sem

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
benepar.download("benepar_en3")

spacy_parser = spacy.load("en_core_web_sm")
spacy_parser.add_pipe("benepar", config={"model": "benepar_en3"})

# load datasets from csv files
wsb = pd.read_csv(r"wsb.csv")
ssb = pd.read_csv(r"ssb.csv")
crypto = pd.read_csv(r"crypto.csv")
stocks = pd.read_csv(r"stocks.csv")

ds = pd.concat([wsb, stocks, crypto, ssb], axis=0)


def remove_emojis(sentence):
    pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols