def _get_nlp(language="en", constituencies=False):
    """ Get spaCy/benepar with models by language """
    import spacy
    language = language.lower()
    model_name = LANGUAGE_TO_MODEL.get(language, language)
    try:
        nlp = spacy.load(model_name)
    except OSError:
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    if language in BENEPAR_LANGUAGES and constituencies:
        from benepar.spacy_plugin import BeneparComponent
        try:
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
        except LookupError:
            import benepar
            benepar.download(BENEPAR_LANGUAGES[language])
            nlp.add_pipe(BeneparComponent(BENEPAR_LANGUAGES[language]))
    # nlp.add_pipe(nlp.create_pipe("sentencizer"))
    return nlp
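# _get_nlp above relies on two lookup tables defined elsewhere in the module.
# A minimal sketch of what they might contain (the entries below are an
# illustrative assumption, not the original values):
LANGUAGE_TO_MODEL = {
    "en": "en_core_web_sm",
    "de": "de_core_news_sm",
}
BENEPAR_LANGUAGES = {
    "en": "benepar_en2",
    "de": "benepar_de",
}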
def parse_tree_features(df):
    """ Get features which can be extracted from the parse tree of a text.

    Adds features:
        NP_per_sent: NPs (noun phrase) / num of sentences
        VP_per_sent: VPs (verb phrase) / num of sentences
        PP_per_sent: PPs (prepositional phrase) / num of sentences
        SBAR_per_sent: SBARs (subordinate clause) / num of sentences
        SBARQ_per_sent: SBARQs (direct question introduced by wh-element) / num of sentences
        avg_NP_size: Average length of an NP
        avg_VP_size: Average length of a VP
        avg_PP_size: Average length of a PP
        avg_parse_tree: Average height of a parse tree

    :param df: the dataframe with the dataset
    :returns: the dataframe with the added features
    """
    nlp = spacy.load(SPACY_MODEL, disable=['ner'])
    nlp.add_pipe(BeneparComponent("benepar_en_small"))
    # parse text
    df['B_Tokens'] = df['Text'].apply(lambda x: nlp(x))
    # get features
    df['NP_per_sent'], df['VP_per_sent'], df['PP_per_sent'], \
        df['SBAR_per_sent'], df['SBARQ_per_sent'], df['avg_NP_size'], \
        df['avg_VP_size'], df['avg_PP_size'], df['avg_parse_tree'] = \
        zip(*df['B_Tokens'].map(_get_parse_tree_features))
    # remove B_Tokens
    df.drop(columns=["B_Tokens"], inplace=True)
    return df
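# _get_parse_tree_features is not shown in this snippet. A minimal sketch that
# is consistent with the docstring above, using benepar's span extensions
# (._.parse_string, ._.constituents, ._.labels) and nltk to measure tree
# height — an assumption, not the original helper:
def _get_parse_tree_features(doc):
    from nltk import Tree
    counts = {'NP': 0, 'VP': 0, 'PP': 0, 'SBAR': 0, 'SBARQ': 0}
    sizes = {'NP': [], 'VP': [], 'PP': []}
    heights = []
    n_sents = 0
    for sent in doc.sents:
        n_sents += 1
        # benepar exposes the bracketed parse per sentence
        heights.append(Tree.fromstring(sent._.parse_string).height())
        for const in sent._.constituents:
            for label in const._.labels:
                if label in counts:
                    counts[label] += 1
                if label in sizes:
                    sizes[label].append(len(const))  # length in tokens

    def avg(xs):
        return sum(xs) / len(xs) if xs else 0.0

    n_sents = max(n_sents, 1)
    return (counts['NP'] / n_sents, counts['VP'] / n_sents,
            counts['PP'] / n_sents, counts['SBAR'] / n_sents,
            counts['SBARQ'] / n_sents, avg(sizes['NP']),
            avg(sizes['VP']), avg(sizes['PP']), avg(heights))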
def __init__(self,
             lang={'spacy': 'en', 'benepar': 'benepar_en2'},
             config=None):
    super().__init__()
    self.download = False
    # Checking if NLTK sentence and word tokenizers should be downloaded
    if not config_berkeley_nlp['benepar_sent_word_tok_downloaded']:
        spacy.load(lang['spacy'])
        config_global['config_benepar']['benepar_sent_word_tok_downloaded'] = True
        self.download = True
    # Checking if parsing model should be downloaded
    if not config_berkeley_nlp['parsing_model_downloaded']:
        benepar.download(lang['benepar'])
        config_global['config_benepar']['parsing_model_downloaded'] = True
        self.download = True
    # Updating yaml file if necessary
    if self.download:
        with open("./config.yaml", "w") as f:
            yaml.dump(config_global, f)
    self.nlp = spacy.load(lang['spacy'])
    self.nlp.add_pipe(BeneparComponent(lang['benepar']))
    self.sd = StanfordDependencies.get_instance(backend='subprocess')  # to convert trees
    self.name_save = 'benepar'
def preprocess(data_dir, output_fn, batch, threads):
    print('preparing text')
    data = all_data(data_dir)
    print('loading parser')
    nlp = spacy.load('en_core_web_lg')
    # Fallback kept for reference:
    # try:
    #     nlp = spacy.load('en_core_web_lg')
    # except IOError:
    #     spacy.cli.download('en_core_web_lg')
    #     nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    with open(output_fn, 'w') as f:
        i = 0
        for doc in nlp.pipe(data, batch_size=batch, n_threads=threads):
            # benepar has no pipe() method, so manually invoke the pipe.
            doc = nlp.get_pipe('benepar')(doc)
            for sent in doc.sents:
                if len(sent.text) < MIN_SENTENCE_CHAR_LENGTH:
                    continue
                if i % 25 == 0:
                    print('%d..' % i, end='')
                i += 1
                f.write(u'< ' + sent.text + '\n')
                f.write(u'> ' + transform_present_span(sent) + '\n')
    print('complete')
def main():
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent('benepar_en2_large'))
    all_negation_examples = read_json(DATA_PATH + 'json/' + DATASET_NAME + '.json')
    write_path = DATA_PATH + 'conll/gold_cue/' + TASK + '_' + DATASET_NAME + '/'
    if not os.path.isdir(write_path):
        os.makedirs(write_path)
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    split_id = 0
    for _, valid_id in cv.split(all_negation_examples, all_negation_examples):
        valid_examples = [all_negation_examples[i] for i in valid_id]
        with open(write_path + 'train_cv' + str(split_id) + '.conll', 'w') as writer:
            for example in tqdm(valid_examples):
                token_list, sentence = get_sentences_and_tokens_from_spacy(
                    example[0], nlp, example[2], example[3])
                for token in token_list:
                    write_string = prepare_line(token, example[1])
                    writer.write(write_string + '\n')
                writer.write('\n')
        split_id += 1
def __init__(
        self,
        categories: List[str],
        polarities: List[str],
        tokenizer: Callable[[str], List[str]] = lambda x: x.split(),
        token_indexers: Dict[str, TokenIndexer] = None,
        position_indexers: Dict[str, TokenIndexer] = None,
        core_nlp: my_corenlp.StanfordCoreNLP = None,
        configuration=None,
        bert_tokenizer=None,
        bert_token_indexers=None,
        sentence_constituency_indexer: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy=False)
    self.tokenizer = tokenizer
    self.token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens")
    }
    self.bert_tokenizer = bert_tokenizer
    self.bert_token_indexers = bert_token_indexers or {
        "bert": SingleIdTokenIndexer(namespace="bert")
    }
    self.position_indexers = position_indexers or {
        "position": SingleIdTokenIndexer(namespace='position')
    }
    self.sentence_constituency_indexer = sentence_constituency_indexer
    self.categories = categories
    self.polarities = polarities
    self.spacy_nlp = spacy.load("en_core_web_sm")
    self.spacy_nlp.add_pipe(BeneparComponent('benepar_en'))
    self.core_nlp = core_nlp
    self.configuration = configuration
def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # disable GPU
    args = parse_args()
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    ConstituencyParser = BeneparComponent("benepar_en2")
    documents = get_documents(args.documents)
    for document in documents:
        with open(document, "r") as f:
            data = json.load(f)
        sentences = data["sentences"]
        gold_spans = data["ent_spans"]
        gold_span_boundaries = [(span[0], span[1]) for span in gold_spans]
        syntactic_spans = extract_spans(ConstituencyParser, nlp, sentences)
        # deduplicate against gold spans
        syntactic_spans = [
            span for span in syntactic_spans
            if (span[0], span[1]) not in gold_span_boundaries
        ]
        spans = syntactic_spans + gold_spans
        data["ent_spans"] = spans
        # write out new spans to a copy of the document (don't overwrite)
        out_document = os.path.join(args.output_dir, os.path.basename(document))
        with open(out_document, "w") as f:
            json.dump(data, f, indent=4)
def __init__(self,
             spacy_mdl='en_core_web_sm',
             benepar_mdl='benepar_en2',
             is_segmented=True,
             is_tokenised=False,
             batch_size=20,
             take_sent_average=True,
             scaler='minmax'):
    self.nlp = spacy.load(spacy_mdl)
    self.nlp.add_pipe(BeneparComponent(benepar_mdl), name='benepar')
    if is_segmented:
        self.nlp.add_pipe(self._prevent_sbd, name='prevent-sbd', before='parser')
    self.is_tokenised = is_tokenised
    if is_tokenised:
        self.nlp.tokenizer = self.nlp.tokenizer.tokens_from_list
    self.batch_size = batch_size
    self.take_sent_average = take_sent_average
    if scaler == 'minmax':
        self.scaler = MinMaxScaler()
    elif scaler == 'standard':
        self.scaler = StandardScaler()
    elif scaler is None:
        self.scaler = None
    else:
        raise ValueError(
            "'scaler' has an unexpected value. Use 'minmax' or 'standard' or None."
        )
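# _prevent_sbd is referenced above but not shown. The usual spaCy v2 pattern
# for suppressing sentence boundary detection looks like this (a sketch; the
# original implementation may differ):
@staticmethod
def _prevent_sbd(doc):
    # Treat each incoming text as exactly one pre-segmented sentence.
    for token in doc[1:]:
        token.is_sent_start = False
    return doc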
def __init__(self):
    self.nlp = spacy.load('en_core_web_sm')
    self.nlp.add_pipe(BeneparComponent("benepar_en_small"))
    self.dependency = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz"
    )
    self.rule_qa2d = rulebased_qa2d.RuleBasedQa2D(self.dependency, self.nlp)
def __init__(self, model_name='en'):
    load_t0 = time()
    print("load model file")
    # load the basic spaCy model
    self.nlp = spacy.load(model_name)
    self.nlp.add_pipe(BeneparComponent('benepar_en_small'))
    load_t1 = time()
    print('* load model time: {:.2f}ms'.format((load_t1 - load_t0) * 1000))
def preprocessing(language: str):
    from benepar.spacy_plugin import BeneparComponent
    import zh_core_web_trf
    import en_core_web_trf
    global ucb_parser
    if language == 'zh':
        nlp = zh_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_zh')
    elif language == 'en':
        nlp = en_core_web_trf.load()
        ucb_parser = BeneparComponent('benepar_en2')
    else:
        print('language error')
        exit(-1)
    nlp.disable_pipes('tagger', 'parser', 'attribute_ruler')
    if language == 'en':
        nlp.disable_pipe('lemmatizer')
    nlp.add_pipe('component', name='cp_parser', last=True)
    return nlp
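# add_pipe('component', ...) above assumes a spaCy v3 factory registered under
# the name 'component' that applies the globally-stored benepar parser. A
# hypothetical sketch of that registration (the real wrapper lives elsewhere):
from spacy.language import Language

@Language.component('component')
def _apply_ucb_parser(doc):
    # Run the benepar constituency parser created in preprocessing().
    return ucb_parser(doc)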
def get_model(spacy_model: str, coref: bool, constparse: bool) -> Language:
    """Loads a model for a language."""
    if spacy_model == 'en':
        spacy_model = 'en_core_web_sm'
    nlp = spacy.load(spacy_model)
    if coref:
        neuralcoref.add_to_pipe(nlp)
    if constparse:
        nlp.add_pipe(BeneparComponent("benepar_en2"))
    return nlp
def init():
    global nlp
    spacy.tokens.Doc.set_extension('features', default={}, force=True)
    nlp = spacy.load('en', disable=['ner'])
    nlp.add_pipe(BeneparComponent("benepar_en_small"))
    nlp.add_pipe(extract_doc_features, name='extract_doc_features', first=False)
    test_me()
def make_parse_trees(sents):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en2"))
    tree_list = []
    for s in tqdm(sents):
        span = list(nlp(s).sents)[0]
        root_node = Node('ROOT', span)
        node = span_to_tree(span)
        root_node.add_child(node)
        root_node.make_rule()
        tree_list.append(root_node)
    return tree_list
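# span_to_tree and Node are defined elsewhere. A plausible recursive sketch of
# span_to_tree over benepar's constituent spans (an assumption about the
# original helper, reusing the Node API seen above):
def span_to_tree(span):
    # Use the first constituent label, falling back to the POS tag for leaves.
    label = span._.labels[0] if span._.labels else span[0].tag_
    node = Node(label, span)
    for child in span._.children:
        node.add_child(span_to_tree(child))
    return node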
def get_model(spacy_model: str, coref: bool, constituents: bool) -> Language:
    if spacy_model == 'en':
        spacy_model = 'en_core_web_sm'
    if spacy_model not in MODEL_NAMES:
        raise ModuleNotFoundError(f'No such spaCy model "{spacy_model}"')
    nlp = spacy.load(spacy_model)
    if coref and spacy_model in COREF:
        neuralcoref.add_to_pipe(nlp)
    if constituents:
        model = CONSTITUENTS.get(spacy_model[:2], "")
        if model:
            nlp.add_pipe(BeneparComponent(model))
    return nlp
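# MODEL_NAMES, COREF and CONSTITUENTS are module-level tables not shown here.
# Hypothetical contents (illustrative values only):
MODEL_NAMES = {'en_core_web_sm', 'en_core_web_lg', 'de_core_news_sm'}
COREF = {'en_core_web_sm', 'en_core_web_lg'}  # models neuralcoref can extend
CONSTITUENTS = {'en': 'benepar_en2', 'de': 'benepar_de'}  # keyed by language code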
def load_models(spacy_model: str, coref: str = '', constituents: str = '') -> Language:
    try:
        nlp = get_model(spacy_model)
        print('loaded spacy model: ' + spacy_model)
    except OSError as e:
        print(e)
        print('Missing spacy model. Try running: python -m spacy download ' + spacy_model)
        print('Defaulting to en_core_web_sm')
        nlp = spacy.load('en_core_web_sm')
    if coref:
        neuralcoref.add_to_pipe(nlp)
    if constituents:
        nlp.add_pipe(BeneparComponent(constituents))
    return nlp
def preprocess(sentence):
    clauses = []
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(BeneparComponent('benepar_en2'))
    doc = nlp(sentence)
    if len(list(doc.sents)) == 0:
        return clauses
    sent = list(doc.sents)[0]
    children = list(sent._.children)
    puncts = '?!.,;:-'
    for clause in children:
        if clause.text not in puncts:
            if 'S' in clause._.labels:
                clauses.append(clause.text + '.')
    if not clauses:
        return [sentence]
    else:
        return clauses
def get_clauses(text):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en"))
    text = text.replace(";", ".")
    text = text.replace("\n", " ")
    doc = nlp(text)
    subsentences = []
    for sent in doc.sents:
        subtexts = create_tree(sent)
        subtexts = lower_no_punct(subtexts)
        subsent1 = remove_double_subsents(subtexts)
        subsent1 = reorder_subsents(sent.text, subsent1)
        subsent2 = concatenate_sep_words(subsent1)
        final = capitalize_first_letters(subsent2)
        subsentences = subsentences + final
    return subsentences
def parse_file(file_path: str) -> list:
    def convert_bytes(num):
        """Convert a size in bytes to a human-readable string (KB, MB, GB, ...)."""
        for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
            if num < 1024.0:
                return "%3.1f %s" % (num, x)
            num /= 1024.0

    def get_file_size(file_path):
        """Return the size of the file at file_path as a human-readable string."""
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

    file_size = get_file_size(file_path)
    if re.search(r'[GT]B', file_size):
        logger.error("File size is %s. This is too large!" % file_size)
        return []
    else:
        nlp = spacy.load("en_core_web_lg")
        nlp.add_pipe(BeneparComponent("benepar_en2"))
        f = open(file_path, "r")
        chapter = {}
        # The first non-empty line is the chapter title; the rest are paragraphs.
        for idx, paragraph in enumerate(
                filter(lambda line: line != "",
                       map(lambda line: line.strip('\n'), f.readlines()))):
            if idx == 0:
                chapter["Title"] = paragraph
                continue
            doc = nlp(paragraph)
            chapter["Paragraphs"] = chapter.get("Paragraphs", []) + [parse_paragraph(doc)]
        print(chapter)
        f.close()
def main(args):
    if args.cuda:
        spacy.require_gpu()
    # Load a spaCy model (supported models are "es" and "en")
    print("Loading spacy...")
    nlp = spacy.load("en_core_web_lg")
    print("Done")
    nlp.tokenizer = lambda text: whitespace_tokenizer(text, nlp.vocab)
    nlp.add_pipe(WordnetAnnotator(nlp.lang), after="tagger")
    nlp.add_pipe(BeneparComponent("benepar_en2"))
    with open(args.data) as f:
        lines = [line.strip() for line in list(f)]
    all_texts = []
    all_feats = []
    docs = nlp.pipe(lines, batch_size=args.batch_size)
    for doc in tqdm(docs, desc="Extracting feats", total=len(lines)):
        doc_feats = []
        doc_texts = []
        for token in doc:
            t_feats = extract_feats(token)
            doc_feats.append(t_feats)
            doc_texts.append(token.text)
        all_feats.append(doc_feats)
        all_texts.append(doc_texts)
    with open(args.data.replace(".tok", ".feats"), "w") as f:
        f.write("|".join(";".join(fn[:2]) for fn in FEATS))
        f.write("\n")
        for text, doc_feats in zip(all_texts, all_feats):
            t_feats_joined = ["|".join(tf) for tf in doc_feats]
            line_feats = " ".join(
                ["|".join((t, feat)) for t, feat in zip(text, t_feats_joined)])
            f.write(line_feats)
            f.write("\n")
def get_clauses_df(descriptions):
    nlp = spacy.load('en')
    nlp.add_pipe(BeneparComponent("benepar_en"))
    descriptions = descriptions.assign(clauses="")
    for i in range(len(descriptions)):
        text = descriptions.iloc[i, 0]
        text = text.replace(";", ".")
        text = text.replace("\n", " ")
        doc = nlp(text)
        subsentences = []
        for sent in doc.sents:
            subtexts = create_tree(sent)
            subtexts = lower_no_punct(subtexts)
            subsent1 = remove_double_subsents(subtexts)
            subsent1 = reorder_subsents(sent.text, subsent1)
            subsent2 = concatenate_sep_words(subsent1)
            final = capitalize_first_letters(subsent2)
            subsentences = subsentences + final
        descriptions.at[i, 'clauses'] = subsentences
    return descriptions
import tensorflow as tf
import spacy
from models.vdcnn.classifier_protocol import VeryDeepCNN
from benepar.spacy_plugin import BeneparComponent
from polyglot.text import Text, Word

tf.flags.DEFINE_integer('layer_index', None, 'layer index')
tf.flags.DEFINE_integer('top_k', 10, '')
tf.flags.DEFINE_string('task', None, '')
tf.flags.DEFINE_integer('num_align', 10, 'number of concepts to be aligned')
tf.flags.DEFINE_integer('num_units', None, '')
FLAGS = tf.flags.FLAGS

model = VeryDeepCNN(task=FLAGS.task)
nlp = spacy.load('en')
nlp.add_pipe(BeneparComponent("benepar_en_small"))


def lemma_custom(token):
    # Keep pronouns and forms of "be" as surface text instead of lemmas.
    if token.lemma_ == '-PRON-':
        return token.text
    if token.lemma_ == 'be':
        return token.text
    return token.lemma_


def get_layer_name(layer_index):
    return 'conv_%d' % layer_index
import pytest


@pytest.fixture(scope='module')
def nlp():
    import spacy
    from benepar.spacy_plugin import BeneparComponent
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    return nlp
def init_spacy_english_model():
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    return nlp
def __init__(self):
    self.nlp = spacy.load('en')
    self.nlp.add_pipe(BeneparComponent("benepar_en2"))
def nlp():
    nlp = spacy.load('en_core_web_lg')
    nlp.add_pipe(BeneparComponent('benepar_en'))
    return nlp
import spacy
from benepar.spacy_plugin import BeneparComponent

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(BeneparComponent("benepar_en2"))

doc = nlp(
    u"The alpine wildflowers are in bloom all around us. Truly a magnificent scene. "
    u"I feel very privileged to be in this place of such jaw-dropping splendour. "
    u"The one quibble I have with the world in this moment of bliss is that I feel "
    u"slightly fatigued -- I suppose this is clear evidence, if any more was needed, "
    u"that I have become physiologically dependent on coffee. Ah, coffee... I would "
    u"really very much like some coffee. Drugs, loves, doves."
)

for sent in list(doc.sents):
    print(sent._.parse_string)
    print(sent._.labels)
    print(list(sent._.constituents))
    # for token in sent:
    #     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)

# Sample output for the first sentence:
'''
(S
  (NP (DT The) (JJ alpine) (NNS wildflowers))
  (VP (VBP are)
    (PP (IN in) (NP (NN bloom)))
    (PP (DT all) (IN around) (NP (PRP us))))
  (. .))
('S',)
[The alpine wildflowers are in bloom all around us., The alpine wildflowers, The, alpine, wildflowers, are in bloom all around us, are, in bloom, in, bloom, all around us, all, around, us, .]
The the DET DT det
alpine alpine ADJ JJ compound
wildflowers wildflower NOUN NNS nsubj
are be AUX VBP ROOT
in in ADP IN prep
bloom bloom NOUN NN pobj
all all DET DT advmod
around around ADP IN prep
us -PRON- PRON PRP pobj
'''
    node = ConstituencyTreeNode(labels, text, node_id=node_id, start=start,
                                end=end, depth=depth)
    children = list(sentence._.children)
    for child in children:
        node.children.append(
            ConstituencyTreeNode._inner_parse_using_spacy(
                child, serial_number, depth=depth + 1))
    return node


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(BeneparComponent('benepar_en2'))
    sentence = 'Great taste bad service.'
    tree = ConstituencyTreeNode.parse_using_spacy(nlp, sentence)
    # adjacency_list = tree.get_adjacency_list()
    all_nodes = ConstituencyTreeNode.get_all_nodes(tree)
    adjacency_list = tree.get_adjacency_list_between_all_node_and_leaf()
    edges = [['%s-%s' % (e2.node_id, e2.text) for e2 in e1]
             for e1 in adjacency_list]
    g = nx.DiGraph()
    g.add_edges_from(edges)
    pos = nx.kamada_kawai_layout(g)
    nx.draw(g, pos=pos, with_labels=True)
    plt.show()
    print()
import json
import fileinput
import functools
from itertools import groupby

import spacy
from benepar.spacy_plugin import BeneparComponent
from nltk import Tree

nlp = spacy.load("en_core_web_sm", disable=["ner"])
benepar = BeneparComponent("benepar_en2")


def until_convergence(fn):
    """Repeatedly apply fn to its own output until the value stops changing."""
    @functools.wraps(fn)
    def wrapper(arg, *args, **kwargs):
        old = object()
        new = arg
        while old != new:
            old = new
            new = fn(old, *args, **kwargs)
        return new
    return wrapper


def join_tiny_clauses_with_next(clauses):
    # e.g. [["This", "is", "just"], ["because"], ["it", "is", "so"]]
    #   -> [["This", "is", "just"], ["because", "it", "is", "so"]]
    clauses_rev = iter(reversed(clauses))