def test_issue3289():
    """Regression test: a pipeline holding a component whose model was never
    initialized must survive a ``to_bytes`` / ``from_bytes`` round trip."""
    source_nlp = English()
    source_nlp.add_pipe(source_nlp.create_pipe("textcat"))
    serialized = source_nlp.to_bytes()

    # The receiving pipeline gets its own textcat (created via the source
    # pipeline's factory, as in the original test) before loading the bytes.
    target_nlp = English()
    target_textcat = source_nlp.create_pipe("textcat")
    target_nlp.add_pipe(target_textcat)
    target_nlp.from_bytes(serialized)
def test_issue3449():
    """Check that the sixth token is "I" for each variant of the sample text
    when the sentencizer is in the pipeline."""
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    samples = (
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    )
    # Process every sample first, then verify, matching the original order.
    parsed = [nlp(sample) for sample in samples]
    for doc in parsed:
        assert doc[5].text == "I"
def main():
    """Run the REST Countries component on a sample sentence and print the
    pipeline names, the per-token country data and the resulting entities."""
    # Only the blank English Language class is used here: no model and no
    # pre-defined pipeline are loaded.
    pipeline = English()
    countries_component = RESTCountriesComponent(pipeline)  # build the component
    pipeline.add_pipe(countries_component)  # register it in the pipeline

    parsed = pipeline(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', pipeline.pipe_names)  # shows the component name
    print('Doc has countries', parsed._.has_country)  # doc-level flag

    # Print the country data attached to each token flagged as a country
    country_tokens = (tok for tok in parsed if tok._.is_country)
    for tok in country_tokens:
        print(tok.text, tok._.country_capital, tok._.country_latlng,
              tok._.country_flag)

    entity_pairs = [(ent.text, ent.label_) for ent in parsed.ents]
    print('Entities', entity_pairs)  # entities set on the doc
def test_issue3468():
    """Verify sentence boundaries are set so that ``Doc.is_sentenced`` is
    restored after a bytes round trip."""

    def check_single_sentence(document):
        # The same three checks are applied before and after serialization.
        assert document[0].is_sent_start
        assert document.is_sentenced
        assert len(list(document.sents)) == 1

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    original = nlp("Hello world")
    check_single_sentence(original)

    payload = original.to_bytes()
    restored = Doc(nlp.vocab).from_bytes(payload)
    check_single_sentence(restored)
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the tech-company recognizer over ``text`` and print what it found.

    ``companies`` are the names to recognize; a built-in default list is used
    when none are passed.
    """
    # Only the blank English Language class is used here: no model and no
    # pre-defined pipeline are loaded.
    pipeline = English()

    # Fall back to the default company names when the caller gave none
    company_names = companies or ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']

    recognizer = TechCompanyRecognizer(pipeline, company_names)  # build component
    pipeline.add_pipe(recognizer, last=True)  # append at the end of the pipeline

    parsed = pipeline(text)
    token_texts = [tok.text for tok in parsed]
    entity_pairs = [(ent.text, ent.label_) for ent in parsed.ents]

    print('Pipeline', pipeline.pipe_names)  # shows the component name
    print('Tokens', token_texts)  # token texts after the component ran
    print('Doc has_tech_org', parsed._.has_tech_org)  # doc-level flag
    print('Token 0 is_tech_org', parsed[0]._.is_tech_org)  # flag on first token
    print('Token 1 is_tech_org', parsed[1]._.is_tech_org)  # flag on second token
    print('Entities', entity_pairs)  # entities set on the doc
def test_issue3209():
    """Regression test for NER labels being mapped to the wrong classes after
    loading a model whose labels were added through ``ner.add_label()``."""
    expected_moves = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]

    # Pipeline whose NER component gets a custom label before training starts
    trained_nlp = English()
    trained_ner = trained_nlp.create_pipe("ner")
    trained_nlp.add_pipe(trained_ner)
    trained_ner.add_label("ANIMAL")
    trained_nlp.begin_training()
    assert trained_ner.move_names == expected_moves

    # A fresh pipeline must expose the same moves after loading the bytes
    loaded_nlp = English()
    loaded_nlp.add_pipe(loaded_nlp.create_pipe("ner"))
    loaded_nlp.from_bytes(trained_nlp.to_bytes())
    loaded_ner = loaded_nlp.get_pipe("ner")
    assert loaded_ner.move_names == expected_moves
def test_issue3456():
    """Piping a batch that contains an empty text through a tagger must not
    crash (previously a padding error in ``layer.ops.unflatten`` in thinc)."""
    nlp = English()
    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)
    nlp.begin_training()

    # Exhaust the generator so every document is actually processed.
    for _ in nlp.pipe(["hi", ""]):
        pass
similarity_scores = [] for article in tokenized: current_article = [] file = article[0] for tokenized_sentence in article[1]: sentence_embedding = get_vec(tokenized_sentence) score = distance.cosine(cc_embedding, sentence_embedding) if score > 1.0 : score = 1.0 current_article.append(score) similarity_scores.append((file,current_article)) return similarity_scores # spaCy stuff nlp = English() nlp.add_pipe(nlp.create_pipe('sentencizer')) tokenizer = Tokenizer(nlp.vocab) # Split the corpus into sentences articles = [] journals = ["ScienceOCR", "NatureOCR"] for journal in journals: for article in corpus[journal]: file = journal + "/" + article[0] text = article[1] sentences = [] doc = nlp(text) for sent in doc.sents: if sent.orth_ != "\n": s = sent.orth_.replace("\n", "") sentences.append(s)
def test_overfitting_IO():
    """Overfit the entity linker on a tiny data set, then check that its
    predictions are correct, survive a disk round trip, and are identical
    between batched and unbatched processing."""
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    # Turn the raw texts into docs so the training examples carry doc.ents
    train_examples = [
        Example.from_dict(nlp(text), annotation)
        for text, annotation in TRAIN_DATA
    ]

    def create_kb(vocab):
        # Artificial KB in which both Russ Cochrans get the same prior weight:
        #   Q2146908 (Russ Cochran): American golfer
        #   Q7381115 (Russ Cochran): publisher
        kb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        kb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return kb

    # Build the entity linker, append it to the pipeline and attach the KB
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
    assert "Q2146908" in entity_linker.kb.vocab.strings

    # Train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length

    for _ in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # Only the losses of the final update are inspected
    assert losses["entity_linker"] < 0.001

    # Components the entity linker needs at prediction time
    nlp.add_pipe("sentencizer", first=True)

    # Custom rule so "Russ Cochran" is recognized as an entity in the
    # example training data
    russ_pattern = {
        "label": "PERSON",
        "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}],
    }
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([russ_pattern])

    # Check the trained model's predictions
    predictions = [
        ent.kb_id_ for text, _ in TRAIN_DATA for ent in nlp(text).ents
    ]
    assert predictions == GOLD_entities

    # The results must be unchanged after writing to and reading from disk
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert "Q2146908" in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert "Q2146908" in entity_linker2.vocab.strings
        assert "Q2146908" in entity_linker2.kb.vocab.strings
        predictions = [
            ent.kb_id_ for text, _ in TRAIN_DATA for ent in nlp2(text).ents
        ]
        assert predictions == GOLD_entities

    # Running pipe twice, or calling nlp directly, must always give the same
    # predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    single_docs = [nlp(text) for text in texts]
    no_batch_deps = [doc.to_array([ENT_KB_ID]) for doc in single_docs]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
def convert_to_extractive_driver(args):
    """
    Driver function to convert an abstractive summarization dataset to an
    extractive dataset. The abstractive dataset must be formatted with two
    files for each split: a source and target file. Example file list for two
    splits: ``["train.source", "train.target", "val.source", "val.target"]``

    Args:
        args: Options object. The attributes read here are ``base_path``,
            ``base_output_path``, ``sentencizer``, ``dataset``,
            ``dataset_version``, ``split_names``, ``data_example_column``,
            ``data_summarized_column``, ``source_ext``, ``target_ext``,
            ``shard_interval``, ``resume`` and ``compression``.
            ``base_output_path`` is overwritten with ``base_path`` when it is
            falsy.

    Raises:
        AssertionError: If ``args.resume`` is set together with
            ``args.dataset`` while sharding is enabled.
        SystemExit: If resuming is requested but ``check_resume_success``
            reports a failure.
    """
    # default is to output to input data directory if no output directory specified
    if not args.base_output_path:
        args.base_output_path = args.base_path

    # load spacy english small model with the "tagger" and "ner" disabled since
    # we only need the "tokenizer" and "parser"
    # more info: https://spacy.io/usage/processing-pipelines
    if args.sentencizer:
        nlp = English()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"])

    if args.dataset:
        dataset = hf_nlp.load_dataset(args.dataset, args.dataset_version)

    # for each split
    for name in tqdm(
        args.split_names, total=len(args.split_names), desc="Dataset Split"
    ):
        # File handles opened for this split. They are closed in the
        # ``finally`` below so they are released even when processing raises
        # or the resume check exits the program.
        opened_files = []
        try:
            if args.dataset:  # if loading using the `nlp` library
                current_dataset = dataset[name]
                source_file = current_dataset[args.data_example_column]
                target_file = current_dataset[args.data_summarized_column]
            else:
                # get the source and target paths
                source_file_path = os.path.join(
                    args.base_path, (name + "." + args.source_ext)
                )
                target_file_path = os.path.join(
                    args.base_path, (name + "." + args.target_ext)
                )
                logger.info("Opening source and target %s files", name)
                source_file = open(source_file_path, "r")
                opened_files.append(source_file)
                target_file = open(target_file_path, "r")
                opened_files.append(target_file)

            if args.shard_interval:  # if sharding is enabled
                # get number of examples to process
                if args.dataset:
                    target_file_len = len(current_dataset)
                else:
                    target_file_len = sum(1 for _ in target_file)
                    # reset pointer back to beginning after getting length
                    target_file.seek(0)

                # find how long the loop will run, round up because any extra
                # examples will form a chunk of size less than
                # `args.shard_interval`
                tot_num_iterations = math.ceil(
                    target_file_len / args.shard_interval
                )

                # default is that there was no previous shard (aka not resuming)
                last_shard = 0
                if args.resume:
                    assert (
                        not args.dataset
                    ), "Cannot resume when using data loaded from the `nlp` library."
                    num_lines_read, last_shard = resume(
                        args.base_output_path, name, args.shard_interval
                    )

                    # if lines have been read and shards have been written to disk
                    if num_lines_read:
                        logger.info("Resuming to line %i", num_lines_read - 1)
                        # seek both the source and target to the next line
                        seek_files(
                            [source_file, target_file], num_lines_read - 1
                        )

                        # checks to make sure the last documents match
                        # this moves the file pointer in source_file forward 1...
                        resume_success = check_resume_success(
                            nlp,
                            args,
                            source_file,
                            last_shard,
                            args.base_output_path,
                            name,
                            args.compression,
                        )
                        # ...so move the target_file pointer forward 1 as well
                        target_file.readline()

                        if not resume_success:
                            logger.error("Exiting...")
                            sys.exit(-1)

                        # subtract the number of shards already created
                        tot_num_iterations -= int(last_shard)
                    else:  # no shards on disk
                        logger.warning(
                            "Tried to resume but no shards found on disk"
                        )

                for piece_idx, (source_docs, target_docs) in tqdm(
                    enumerate(
                        zip(
                            read_in_chunks(source_file, args.shard_interval),
                            read_in_chunks(target_file, args.shard_interval),
                        )
                    ),
                    total=tot_num_iterations,
                    desc="Shards",
                ):
                    piece_idx += last_shard  # effective if resuming (offsets the index)
                    convert_to_extractive_process(
                        args, nlp, source_docs, target_docs, name, piece_idx
                    )
            else:
                # only `str.strip()` the lines if loading from an actual file,
                # not the `nlp` library
                if args.dataset:
                    source_docs = source_file
                    target_docs = target_file
                else:
                    source_docs = [line.strip() for line in source_file]
                    target_docs = [line.strip() for line in target_file]
                convert_to_extractive_process(
                    args, nlp, source_docs, target_docs, name
                )
        finally:
            # Only real files were registered above; data loaded from the
            # `nlp` library has nothing to close.
            for handle in opened_files:
                handle.close()
import zipfile import tempfile from spacy.lang.en import English from dstc_utilities import * nlp = English() sentencizer = nlp.create_pipe("sentencizer") nlp.add_pipe(sentencizer) # DSTC3 archive directory archive_dir = 'dstc_archive' # Processed data directory data_dir = os.path.join('dstc_data', 'json') remove_words = ['sil', 'unitelligible', 'unintelligible', 'background', 'noise', 'nosie', 'cough', 'coughing', 'laughing', 'breathing', 'click', 'clicking', 'knock', 'knocking', 'system', 'dog', 'barking', 'whisper', 'throat', 'clear', 'clearing', 'throatnoise', 'whisperingunintelligible'] sets = ['train', 'test'] # Create a temporary directory and unzip the archived data with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir: # Load into temp directory zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'dstc3_archive.zip'), 'r') zip_file.extractall(tmp_dir) zip_file.close() for dataset_name in sets: # Get a list of all the dialogues set_list = os.listdir(os.path.join(tmp_dir, dataset_name))
def main():
    """Extract an OpenIE triple for every sentence of a text file and export
    the per-sentence analysis to a CSV file.

    For each sentence the part-of-speech string, dependency structure, root
    verb, raw OpenIE output, selected triple, verb tense and trimmed triple
    are computed. Sentences whose trimmed triple contains ``None`` are left
    out of the export while ``remove_bad_triples`` is enabled. Input and
    output locations are hard-coded below.
    """
    # Setup Cuda if available, otherwise use the CPU
    device = -1
    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "/Users/bencullen/Projects/StoryGrapher/text_data/test.txt"
    save_path = "/Users/bencullen/Projects/StoryGrapher/output/triples/"
    data_name = data_path.split('/')[-1]

    # Generate Models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url, cuda_device=device)
    print("Generated openie predictor")

    # A blank English() pipeline used to be created here and was immediately
    # overwritten by the loaded model, so only the loaded model is built.
    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")
    print("Finished generating models")

    # One list per exported column; all lists stay aligned by sentence
    sentences = []
    all_triples = []
    openie_raw_json = []
    sent_parts_of_speech = []
    sent_structure = []
    sent_roots = []
    selected_triples = []
    verb_tenses = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    timestamp = time.strftime('%b-%d-%y_%H:%M', time.localtime())

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)

        sent_pos = get_sent_pos_string(sent)
        print(sent_pos)

        sent_dep = get_sent_dep(sent)
        print(sent_dep)

        sent_root = get_root_verb(sent)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor, sent.text.strip())
        openie_str = get_triple_string_from_json(openie_result)

        relevant_triple = get_relevant_triple(openie_result, sent_root)
        verb_tense = get_verb_tense(sent, relevant_triple)
        trimmed = trim_triple(spacy_sent, relevant_triple)
        print("\n")

        # Skip the whole row when the trimmed triple is incomplete, instead
        # of appending to every list and popping again afterwards.
        if remove_bad_triples and None in trimmed:
            continue

        sentences.append(sent)
        sent_parts_of_speech.append(sent_pos)
        sent_structure.append(sent_dep)
        sent_roots.append(sent_root)
        openie_raw_json.append(openie_result)
        all_triples.append(openie_str)
        selected_triples.append(relevant_triple)
        verb_tenses.append(verb_tense)
        trimmed_triples.append(trimmed)

        # Good triples are only counted while filtering is enabled
        if remove_bad_triples:
            good_triples += 1

    # Put sentence and triple data into a pandas dataframe for processing
    triples_data = pd.DataFrame({
        'Sentences': sentences,
        'Sentence Parts of speech': sent_parts_of_speech,
        'Sentence Dependenceies': sent_structure,
        'Extracted Triples': all_triples,
        'Extraction JSON': openie_raw_json,
        'Root Verb': sent_roots,
        'Selected Triple': selected_triples,
        'Triple\'s Verb Tense': verb_tenses,
        'Trimmed Triple': trimmed_triples
    })

    print(good_triples, " triples of total triples ", total_triples, " were good")

    # Store the DataFrame into a csv file for examination
    csv_name = data_name + '_triples_ ' + timestamp + '.csv'
    triples_data.to_csv(os.path.join(save_path, csv_name))
def test_positive_class_not_present():
    """Initializing textcat with a ``positive_label`` that is not among the
    given labels must raise ``ValueError``."""
    nlp = English()
    textcat = nlp.add_pipe("textcat")
    example_getter = make_get_examples_single_label(nlp)
    init_options = {"labels": ["SOME", "THING"], "positive_label": "POS"}
    with pytest.raises(ValueError):
        textcat.initialize(example_getter, **init_options)
import subprocess from spacy.lang.en import English import os import json import fire # Spacy model for sentence segmentation (among other things, but we only do segmentation) nlp_lg = spacy.load("en_core_web_lg") # nlp_lg.max_length = 1500000 # Also spacy, but using the rule-based sentencizer # Note: we use this for bookcorpus, since the dependency-parser-based sentencizer # tends to put quotation marks as their own sentences, and bookcorpus has a lot # of quotations. nlp = English() nlp.add_pipe(nlp.create_pipe("sentencizer")) # Increase the max length of documents from 1M to 14M characters for this # sentencizer. Books are long. The longest book in bookcorpus is: # 13961563 out_txts/682810__debunkanji-chinese-glyphs-used-in-japanese.txt # We can do this because we're using the rule-based sentencizer, not the # parser, so we don't need as much memory. nlp.max_length = 14000000 def write_doc(doc): """From the BERT readme: The input is a plain text file, with one sentence per line. (It is important that these be actual sentences for the "next sentence prediction" task). Documents are delimited by empty lines. """ for sent in doc.sents:
def generate_rules(patterns):
    """Build a blank English pipeline whose only component is an entity ruler
    loaded with ``patterns``, and save it to the ``hp_ner`` directory."""
    pipeline = English()
    entity_ruler = EntityRuler(pipeline)
    entity_ruler.add_patterns(patterns)
    pipeline.add_pipe(entity_ruler)
    pipeline.to_disk("hp_ner")
def main():
    """Build rule-based training data for astronomy terms and train a model.

    Steps: parse the command-line options, read the term list from
    ``astronomy.xls``, concatenate the corpus files into
    ``corpus/result.txt``, annotate every sentence of that text with an
    entity ruler built from the terms, and pass the annotated data to
    ``train_spacy``.
    """
    parser = argparse.ArgumentParser()
    add_arg = parser.add_argument
    add_arg('-i', dest='iterations', type=int, default=ITER,
            help='Number of iteration wanted, default value %d' % ITER)
    add_arg('-o', dest='output', type=str, default=OUTPUT_DIR,
            help='Output directory path, default %s' % OUTPUT_DIR)
    # NOTE(review): with type=bool argparse turns any non-empty string into
    # True (so "-l False" enables loading). Left unchanged to keep the
    # command-line behavior; consider action='store_true'.
    add_arg('-l', dest='load', type=bool, default=False,
            help='Load pretrained Spacy model for English, default False')
    args = parser.parse_args()

    print("Load term's list")
    terms_corpus = pd.read_excel('astronomy.xls')
    # the list containing the phrases to be matched: the part of each key
    # that starts two characters after its first ':'
    terminology_list = [
        term[term.find(':') + 2:] for term in terms_corpus['key']
    ]

    print("Read the corpus files...")
    read_files = glob.glob("corpus/Astromony_*.txt")
    with open("corpus/result.txt", "wb") as outfile:
        for f in progressbar(read_files):
            with open(f, "rb") as infile:
                outfile.write(infile.read())

    # Read the merged corpus back as text. A context manager is used so the
    # handle is closed; it used to stay open.
    with open('corpus/result.txt') as corpus_file:
        text = corpus_file.read()

    nlp_rule_based = English()
    ruler = EntityRuler(nlp_rule_based)

    # create patterns: a single-word term becomes a plain string pattern, any
    # other term becomes a list with one {"lower": word} dict per word
    patterns = []
    for term in terminology_list:
        words = term.split()
        if len(words) == 1:
            pattern = words[0]
        else:
            pattern = [{"lower": word} for word in words]
        patterns.append({"label": "AstroTerm", "pattern": pattern})

    # add patterns and pipe
    ruler.add_patterns(patterns)
    nlp_rule_based.add_pipe(ruler)

    # generate annotated data
    print("Generate annotated data...")
    train_data = []
    for sentence in progressbar(nltk.tokenize.sent_tokenize(text)):
        doc = nlp_rule_based(sentence)
        train_data.append(extract_entities(doc))

    # train the model with the annotated data
    train_spacy(train_data, args.iterations, args.load, args.output)
def split_spacy_en(text):
    """Pass ``text`` to ``prepare_spacy`` together with a blank English
    pipeline that has a sentencizer added, and return the result."""
    english_pipeline = English()
    sentencizer = english_pipeline.create_pipe('sentencizer')
    english_pipeline.add_pipe(sentencizer)
    return prepare_spacy(text, english_pipeline)
def test_lemmatizer_requires_labels():
    """Initializing a pipeline with a trainable lemmatizer but no examples
    must raise ``ValueError``."""
    pipeline = English()
    pipeline.add_pipe("trainable_lemmatizer")
    with pytest.raises(ValueError):
        pipeline.initialize()
def generate_rules(pattern):
    """Build a blank English pipeline with an entity ruler and save it.

    Args:
        pattern: The entity-ruler patterns, passed to ``add_patterns``.

    The pipeline is written to the ``hp_ner`` directory.
    """
    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    # Use the argument itself. The body used to read the undefined name
    # ``patterns`` and so never used what the caller passed in.
    ruler.add_patterns(pattern)
    nlp.to_disk("hp_ner")
def main():
    """Extract a trimmed OpenIE triple per sentence, export the triples to a
    CSV file and draw them as a graph image.

    Sentences whose trimmed triple contains ``None`` are left out while
    ``remove_bad_triples`` is enabled. Each remaining triple contributes two
    graph edges: element 0 to element 1 and element 1 to element 2. Input
    and output locations are hard-coded below.
    """
    # Setup Cuda if available, otherwise use the CPU
    device = -1
    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "data/raw/anne_bonnie.txt"
    save_path = "data/triples/"
    data_name = data_path.split('/')[-1]

    # Generate Models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url, cuda_device=device)
    print("Generated openie predictor")

    # A blank English() pipeline used to be created here and was immediately
    # overwritten by the loaded model, so only the loaded model is built.
    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")
    print("Finished generating models")

    # Parallel lists: sentences[i] produced trimmed_triples[i]
    sentences = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    timestamp = time.strftime('%b-%d-%y_%H:%M', time.localtime())

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)

        # Get the root of the sentence
        sent_root = get_root_verb(sent)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor, sent.text.strip())

        # Get relevant triple
        relevant_triple = get_relevant_triple(openie_result, sent_root)

        # Trim the triple
        trimmed = trim_triple(spacy_sent, relevant_triple)
        print("Trimmed Triple:", trimmed, "\n")

        # Skip incomplete triples instead of appending and popping again
        if remove_bad_triples and None in trimmed:
            continue

        sentences.append(sent)
        trimmed_triples.append(trimmed)

        # Good triples are only counted while filtering is enabled
        if remove_bad_triples:
            good_triples += 1

    # Put sentence and triple data into a pandas dataframe for exporting
    triples_data = pd.DataFrame({
        'Sentence': sentences,
        'Trimmed Triple': trimmed_triples
    })

    print(good_triples, " triples of total ", total_triples, " triples were extracted")

    # Store the DataFrame into a csv file for examination
    csv_name = data_name + '_triples_ ' + timestamp + '.csv'
    triples_data.to_csv(os.path.join(save_path, csv_name))

    # Create graph object
    G = nx.Graph()
    file_name = data_name + ' Graph ' + timestamp

    # Add nodes to graph and connect them
    for triple in trimmed_triples:
        G.add_edge(triple[0], triple[1])
        G.add_edge(triple[1], triple[2])

    # Create graph picture
    pos = nx.spring_layout(G)
    fig = plt.figure(figsize=(45, 45))
    fig.suptitle(file_name)
    nx.draw(G, pos, edge_color='black', width=1, linewidths=1,
            node_size=1000, node_color='seagreen', alpha=0.9,
            labels={node: node for node in G.nodes()})

    # Save the graph as a picture
    image_name = data_name + '_graph_' + timestamp + '.png'
    plt.savefig(os.path.join(save_path, image_name))
# external libraries import numpy as np import math import torch import torch.nn.functional as F from spacy.lang.en import English # internal utilities import config tokenizer = English() tokenizer.add_pipe(tokenizer.create_pipe("sentencizer")) device = torch.device("cuda" if config.cuda else "cpu") def clean_text(text): text = text.replace("]", " ] ") text = text.replace("[", " [ ") text = text.replace("\n", " ") text = text.replace("''", '" ').replace("``", '" ') return text def word_tokenize(text): tokens = [token.text for token in tokenizer(text) if token.text] tokens = [t for t in tokens if t.strip("\n").strip()] return tokens def sent_tokenize(text):
def spacy_sbd(text):
    """Split ``text`` into sentences with a blank English pipeline plus a
    sentencizer and return the sentences as a list of strings."""
    pipeline = English()
    pipeline.add_pipe("sentencizer")
    parsed = pipeline(text)
    return [str(sentence) for sentence in parsed.sents]
# Script body: parse the command-line options, build a sentence-splitting
# spaCy pipeline, make sure the dataset directory exists and generate the
# open IE input for every data split.
parser = argparse.ArgumentParser(
    'Generate raw sentences file for consumption by open IE 6.')
# NOTE(review): the default is 'squad', which the help text does not mention.
parser.add_argument('--dataset', default='squad', help='trivia_qa or hotpot_qa')
parser.add_argument('-debug', default=False, action='store_true', help='If true, run on tiny portion of train dataset')
args = parser.parse_args()

# Not read anywhere in this span; presumably a progress-report interval used
# by code defined elsewhere (TODO confirm).
update_incr = 10 if args.debug else 10000

print('Loading Spacy...')
spacy_nlp = English()  # just the language with no model
sentencizer = spacy_nlp.create_pipe('sentencizer')
spacy_nlp.add_pipe(sentencizer)
print('Done Loading Spacy...')

# Data lives in ../data/<dataset>; only the last path component is created,
# so ../data itself must already exist.
data_dir = os.path.join('..', 'data', args.dataset)
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

# Debug mode processes only the 'mini' split; otherwise 'squad' has no
# 'test' split while every other dataset has all three.
if args.dataset == 'squad':
    dtypes = ['mini'] if args.debug else ['train', 'validation']
else:
    dtypes = ['mini'] if args.debug else ['train', 'test', 'validation']

# Generate the input for each split; start_time is handed to duration()
# after the split has been generated.
for dtype in dtypes:
    start_time = time()
    generate_ie_input(dtype, data_dir)
    duration(start_time)
class CrazyTokenizer(object): """ Tokenizer with Reddit- and Twitter-specific options Parameters ---------- lowercase : bool, optional If True, lowercase all tokens. Defaults to True. keepcaps: bool, optional If True, keep ALL CAPS WORDS uppercased. Defaults to False. normalize: int or bool, optional If not False, perform normalization of repeated charachers ("awesoooooome" -> "awesooome"). The value of parameter determines the number of occurences to keep. Defaults to 3. ignore_quotes: bool, optional If True, ignore tokens contained within double quotes. Defaults to False. ignore_reddit_quotes: bool, optional If True, remove quotes from the Reddit comments. Defaults to False. ignore_stopwords: str, list, or boolean, optional Whether to ignore stopwords - str: language to get a list of stopwords for from NLTK package - list: list of stopwords to remove - True: use built-in list of the english stop words - False: keep all tokens Defaults to False stem: {False, 'stem', 'lemm'}, optional Whether to perform word stemming - False: do not perform word stemming - 'stem': use PorterStemmer from NLTK package - 'lemm': use WordNetLemmatizer from NLTK package remove_punct: bool, optional If True, remove punctuation tokens. Defaults to True. remove_breaks: bool, optional If True, remove linebreak tokens. Defaults to True. decontract: bool, optional If True, attempt to expand certain contractions. Defaults to False. 
Example: "'ll" -> " will" numbers, subreddits, reddit_usernames, emails: False or str, optional Replacement of the different types of tokens - False: leaves these tokens intact - str: replacement token - '': removes all occurrences of these tokens twitter_handles: False, 'realname' or str, optional Processing of twitter handles - False: do nothing - str: replacement token - 'realname': replace with the real screen name of Twitter account - 'split': split handles using Viterbi algorithm Example: "#vladimirputinisthebest" -> "vladimir putin is the best" hashtags: False or str, optional Processing of hashtags - False: do nothing - str: replacement token - 'split': split hashtags according using Viterbi algorithm urls: False or str, optional Replacement of parsed URLs - False: leave URL intact - str: replacement token - dict: replace all URLs stored in keys with the corresponding values - '': removes all occurrences of these tokens - 'domain': extract domain ("http://cnn.com" -> "cnn") - 'domain_unwrap_fast': extract domain after unwraping links for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com) - 'domain_unwrap': extract domain after unwraping all links - 'title': extract and tokenize title of each link after unwraping it Defaults to False. extra_patterns: None or list of tuples, optional Replacement of any user-supplied extra patterns. Tuples must have the following form: (name, re_pattern, replacement_token): - name (str): name of the pattern - re_pattern (_sre.SRE_Pattern): compiled re pattern - replacement_token (str): replacement token Defaults to None keep_untokenized: None or list, optional List of expressions to keep untokenized Example: ["New York", "Los Angeles", "San Francisco"] whitespaces_to_underscores: boolean, optional If True, replace all whitespace characters with underscores in the final tokens. Defaults to True. remove_nonunicode: boolean, optional If True, remove all non-unicode characters. Defaults to False. 
pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional Replace positive, negative, and neutral emojis with the special tokens - None: do not perform replacement - True: perform replacement of the default lists of emojis - list: list of emojis to replace print_url_warnings: bool, optional If True, print URL-related warnings. Defaults to False. latin_chars_fix: bool, optional Try applying this fix if you have a lot of \\xe2\\x80\\x99-like or U+1F601-like strings in your data. Defaults to False. ngrams: int, optional Add ngrams of tokens after tokenizing """ def __init__(self, lowercase=True, keepcaps=False, normalize=3, ignore_quotes=False, ignore_reddit_quotes=False, ignore_stopwords=False, stem=False, remove_punct=True, remove_breaks=True, decontract=False, twitter_handles=False, urls=False, hashtags=False, numbers=False, subreddits=False, reddit_usernames=False, emails=False, extra_patterns=None, keep_untokenized=None, whitespaces_to_underscores=True, remove_nonunicode=False, remove_numbers=False, pos_emojis=None, neg_emojis=None, neutral_emojis=None, print_url_warnings=False, latin_chars_fix=False, ngrams=1, wordnet=None, porterstem=None): self.params = locals() self._nlp = English() self._merging_matcher = Matcher(self._nlp.vocab) self._matcher = Matcher(self._nlp.vocab) self._replacements = {} self._domains = {} self._realnames = {} self._stopwords = None self.wordnet = wordnet alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check) hashtag_flag = self._nlp.vocab.add_flag(hashtag_check) twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check) self._merging_matcher.add('HASHTAG', None, [{ 'ORTH': '#' }, { 'IS_ASCII': True }]) self._merging_matcher.add('SUBREDDIT', None, [{ 'ORTH': '/r' }, { 'ORTH': '/' }, { alpha_digits_flag: True }], [{ 'ORTH': 'r' }, { 'ORTH': '/' }, { alpha_digits_flag: True }]) self._merging_matcher.add('REDDIT_USERNAME', None, [{ 'ORTH': '/u' }, { 'ORTH': '/' }, { alpha_digits_flag: True }], [{ 'ORTH': 
'u' }, { 'ORTH': '/' }, { alpha_digits_flag: True }]) if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules): try: self._stopwords = stopwords.words(ignore_stopwords) except OSError: raise ValueError('Language {} was not found by NLTK'.format( ignore_stopwords)) elif ignore_stopwords is True: self._matcher.add('STOPWORDS', self._remove_token, [{ 'IS_STOP': True }]) elif isinstance(ignore_stopwords, list): self._stopwords = [word.lower() for word in ignore_stopwords] elif ignore_stopwords is not False: raise TypeError( 'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed' .format(type(ignore_stopwords))) if lowercase and (not keepcaps): self._matcher.add('LOWERCASE', self._lowercase, [{ 'IS_LOWER': False }]) elif lowercase and keepcaps: self._matcher.add('LOWERCASE', self._lowercase, [{ 'IS_LOWER': False, 'IS_UPPER': False }]) if remove_punct: self._matcher.add('PUNCTUATION', self._remove_token, [{ 'IS_PUNCT': True }]) if remove_numbers: self._matcher.add('NUMBERS', self._remove_token, [{ 'LIKE_NUM': True }]) if remove_breaks: def break_check(text): return bool(BREAKS_RE.fullmatch(text)) break_flag = self._nlp.vocab.add_flag(break_check) self._matcher.add('BREAK', self._remove_token, [{ break_flag: True }]) if normalize: def normalize_check(text): return bool(NORMALIZE_RE.search(text)) normalize_flag = self._nlp.vocab.add_flag(normalize_check) self._matcher.add('NORMALIZE', self._normalize, [{ normalize_flag: True }]) if numbers is not False: self._matcher.add('NUMBER', self._replace_token, [{ 'LIKE_NUM': True }]) self._replacements['NUMBER'] = numbers if urls is not False: if urls in [ 'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title' ]: self._urls = urls self._matcher.add('URL', self._process_url, [{ 'LIKE_URL': True }]) elif isinstance(urls, dict): self._domains = urls self._urls = 'domain_unwrap_fast' self._matcher.add('URL', self._process_url, [{ 'LIKE_URL': True }]) else: self._matcher.add('URL', 
self._replace_token, [{ 'LIKE_URL': True }]) self._replacements['URL'] = urls if emails is not False: self._matcher.add('EMAIL', self._replace_token, [{ 'LIKE_EMAIL': True }]) self._replacements['EMAIL'] = emails if reddit_usernames is not False: def reddit_username_check(text): return bool(REDDITORS_RE.fullmatch(text)) reddit_username_flag = self._nlp.vocab.add_flag( reddit_username_check) self._matcher.add('REDDIT_USERNAME', self._replace_token, [{ reddit_username_flag: True }]) self._replacements['REDDIT_USERNAME'] = reddit_usernames if subreddits is not False: def subreddit_check(text): return bool(SUBREDDITS_RE.fullmatch(text)) subreddit_flag = self._nlp.vocab.add_flag(subreddit_check) self._matcher.add('SUBREDDIT', self._replace_token, [{ subreddit_flag: True }]) self._replacements['SUBREDDIT'] = subreddits if twitter_handles is not False: self._matcher.add('TWITTER_HANDLE', self._handles_postprocess, [{ twitter_handle_flag: True }]) if hashtags is not False: self._matcher.add('HASHTAG', self._hashtag_postprocess, [{ hashtag_flag: True }]) if hashtags == 'split' or twitter_handles == 'split': file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt') with open(file) as f: self._words = f.read().split() self._wordcost = dict((k, log((i + 1) * log(len(self._words)))) for i, k in enumerate(self._words)) self._maxword = max(len(x) for x in self._words) if twitter_handles == 'realname': with open(os.path.join(DATA_PATH, 'realnames.json')) as f: self._realnames = json.load(f) if ignore_quotes: self._merging_matcher.add('QUOTE', None, [{ 'ORTH': '"' }, { 'OP': '*', 'IS_ASCII': True }, { 'ORTH': '"' }]) def doublequote_check(text): return bool(QUOTES_RE.fullmatch(text)) doublequote_flag = self._nlp.vocab.add_flag(doublequote_check) self._matcher.add('DOUBLE_QUOTES', self._remove_token, [{ doublequote_flag: True }]) if self._stopwords: def stopword_check(text): return bool(text.lower() in self._stopwords) stopword_flag = self._nlp.vocab.add_flag(stopword_check) 
self._matcher.add('STOPWORD', self._remove_token, [{ stopword_flag: True }]) if keep_untokenized is not None: if not isinstance(keep_untokenized, list): raise ValueError( "keep_untokenized has to be either None or a list") for i, phrase in enumerate(keep_untokenized): phrase_tokens = phrase.split(' ') rule = [] for token in phrase_tokens: rule.append({'LOWER': token.lower()}) self._merging_matcher.add('RULE_' + str(i), None, rule) if pos_emojis: if not isinstance(pos_emojis, list): pos_emojis = POS_EMOJIS pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis] self._matcher.add('HAPPY', self._replace_token, *pos_patterns) self._replacements['HAPPY'] = 'POS_EMOJI' if neg_emojis: if not isinstance(neg_emojis, list): neg_emojis = NEG_EMOJIS neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis] self._matcher.add('SAD', self._replace_token, *neg_patterns) self._replacements['SAD'] = 'NEG_EMOJI' if neutral_emojis: if not isinstance(neutral_emojis, list): neutral_emojis = NEUTRAL_EMOJIS neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis] self._matcher.add('NEUTRAL', self._replace_token, *neutral_patterns) self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI' if isinstance(extra_patterns, list): self._flags = {} for name, re_pattern, replacement_token in extra_patterns: def flag(text): return bool(re_pattern.search(text)) self._flags[name] = self._nlp.vocab.add_flag(flag) self._matcher.add(name, self._replace_token, [{ self._flags[name]: True }]) self._replacements[name] = replacement_token if stem and ('nltk' in sys.modules): if stem == 'stem': self._stemmer = porterstem elif stem == 'lemm': # self._stemmer = wordnetlem pass else: raise ValueError( 'Stemming method {} is not supported'.format(stem)) self._matcher.add('WORD_TO_STEM', self._stem_word, [{ 'IS_ALPHA': True }]) retokenize_flag = self._nlp.vocab.add_flag(retokenize_check) self._matcher.add('RETOKENIZE', self._retokenize, [{ retokenize_flag: True, 'IS_PUNCT': False, 'LIKE_URL': False, 
'LIKE_EMAIL': False, 'LIKE_NUM': False, hashtag_flag: False, twitter_handle_flag: False }]) self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True) self._nlp.add_pipe(self._match_doc, name='match_doc', last=True) self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True) @staticmethod def _lowercase(__, doc, i, matches): # Lowercase tokens __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = tok._.transformed_text.lower() def _stem_word(self, __, doc, i, matches): # Stem tokens __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['stem'] == 'stem': tok._.transformed_text = self._stemmer.stem( tok._.transformed_text) elif self.params['stem'] == 'lemm': tok._.transformed_text = self.lemmatize(tok._.transformed_text) def lemmatize(self, word, pos=NOUN): lemmas = self.wordnet._morphy(word, pos) return min(lemmas, key=len) if lemmas else word def _normalize(self, __, doc, i, matches): # Normalize repeating symbols __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = NORMALIZE_RE.sub( r"\1" * self.params['normalize'], tok._.transformed_text) def _process_url(self, __, doc, i, matches): # Process found URLs __, start, end = matches[i] span = doc[start:end] for tok in span: found_urls = URLS_RE.findall(tok.text) if found_urls: if found_urls[0] in self._domains: tok._.transformed_text = self._domains[found_urls[0]] elif self._urls == 'domain': tok._.transformed_text = tldextract.extract( found_urls[0]).domain elif self._urls != 'title': if self._urls == 'domain_unwrap': domain = unshorten_url( found_urls[0], None, self.params['print_url_warnings']) else: domain = unshorten_url( found_urls[0], URL_SHORTENERS, self.params['print_url_warnings']) self._domains[found_urls[0]] = domain tok._.transformed_text = domain elif self._urls == 'title': domain = unshorten_url(found_urls[0], URL_SHORTENERS) if domain != 'twitter': title = get_url_title( 
found_urls[0], self.params['print_url_warnings']) title = self.tokenize(URLS_RE.sub('', title)) else: title = '' tok._.transformed_text = title self._domains[found_urls[0]] = title def _replace_token(self, __, doc, i, matches): # Replace tokens with something else match_id, start, end = matches[i] span = doc[start:end] replacement_token = self._replacements[doc.vocab.strings[match_id]] for tok in span: tok._.transformed_text = replacement_token @staticmethod def _remove_token(__, doc, i, matches): # Remove tokens __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = '' def _retokenize(self, __, doc, i, matches): # Retokenize __, start, end = matches[i] span = doc[start:end] for tok in span: text = tok.text text = re.sub(r'([#@])', r' \1', text) text = re.sub(r'\s{2,}', ' ', text).strip() tok._.transformed_text = self.tokenize(text) def _infer_spaces(self, text): # Infer location of spaces in hashtags text = text.lower() text = re.sub(r'[^\w\s]', '', text) def best_match(i): # Find the best match for the first i characters # assuming costs has been built for the first (i-1) characters candidates = enumerate(reversed(cost[max(0, i - self._maxword):i])) return min( (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1) for k, c in candidates) cost = [0] for i in range(1, len(text) + 1): cur_cost, k = best_match(i) cost.append(cur_cost) out = [] i = len(text) while i > 0: cur_cost, k = best_match(i) assert cur_cost == cost[i] out.append(text[i - k:i]) i -= k return list(reversed(out)) def _handles_postprocess(self, __, doc, i, matches): # Process twitter handles __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['twitter_handles'] == 'realname': if tok.text in self._realnames: tok._.transformed_text = self._realnames[tok.text] else: handle = get_twitter_realname(tok.text) realname = self.tokenize(TWITTER_HANDLES_RE.sub( '', handle)) tok._.transformed_text = realname self._realnames[tok.text] = 
realname elif self.params['twitter_handles'] == 'split': poss = self._infer_spaces(tok._.transformed_text[1:]) if poss: tok._.transformed_text = poss else: tok._.transformed_text = self.params['twitter_handles'] def _hashtag_postprocess(self, __, doc, i, matches): # Process hashtags __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['hashtags'] == 'split': poss = self._infer_spaces(tok._.transformed_text[1:]) if poss: tok._.transformed_text = poss else: tok._.transformed_text = self.params['hashtags'] @staticmethod def _decontract(text): # Expand contractions for contraction, decontraction in DECONTRACTIONS.items(): text = re.sub(contraction, decontraction, text) return text def _preprocess_text(self, text): # Do some preprocessing text = re.sub("’", "'", text) if self.params['remove_nonunicode']: try: text = text.encode('utf-8').decode('unicode-escape') text = ''.join(filter(lambda x: x in string.printable, text)).strip() except UnicodeDecodeError: warnings.warn( '(UnicodeDecodeError while trying to remove non-unicode characters' ) if self.params['decontract']: text = self._decontract(text) text = html.unescape(text) if self.params['latin_chars_fix']: if EMOJIS_UTF_RE.findall(text): text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text) for utf_code, emoji in EMOJIS_UTF.items(): text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text) if EMOJIS_UNICODE_RE.findall(text): text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text) for utf_code, emoji in EMOJIS_UNICODE.items(): text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text) if LATIN_CHARS_RE.findall(text): for _hex, _char in LATIN_CHARS.items(): text = LATIN_CHARS_PATS[_hex].sub(_char, text) if self.params['ignore_reddit_quotes']: text = REDDIT_QUOTES_RE.sub(text, ' ') text = text.replace('.@', '. 
@') text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text) text = re.sub(r'\s{2,}', ' ', text) return text.strip() def _merge_doc(self, doc): # Perform merging for certain types of tokens matches = self._merging_matcher(doc) spans = [] for __, start, end in matches: spans.append(doc[start:end]) for span in spans: span.merge() for tok in doc: tok._.transformed_text = tok.text return doc def _match_doc(self, doc): # Perform all additional processing self._matcher(doc) return doc def _postproc_doc(self, doc): # Perform postprocessing doc._.tokens = [] for tok in doc: if isinstance(tok._.transformed_text, list): doc._.tokens.extend(tok._.transformed_text) elif tok._.transformed_text.strip() != '': if self.params['whitespaces_to_underscores']: tok._.transformed_text = "_".join( tok._.transformed_text.split()) doc._.tokens.append(tok._.transformed_text.strip()) return doc def tokenize(self, text): """ Tokenize document Parameters ---------- text : str Document to tokenize Returns ------- list List of tokens Examples -------- >>> from redditscore.tokenizer import CrazyTokenizer >>> tokenizer = CrazyTokenizer(splithashtags=True, hashtags=False) >>> tokenizer.tokenize("#makeamericagreatagain") ["make", "america", "great", "again"] """ if not isinstance(text, str): warnings.warn('Document {} is not a string'.format(text)) return [] text = self._preprocess_text(text) doc = self._nlp(text) tokens = doc._.tokens if self.params['ngrams'] > 1: if self.params['whitespaces_to_underscores']: tokens = word_ngrams(tokens, (1, self.params['ngrams']), separator='_') else: tokens = word_ngrams(tokens, (1, self.params['ngrams'])) return tokens
# Load the list of country names that the phrase matcher should recognise.
with open('countries.json') as f:
    COUNTRIES = json.load(f)

nlp = English()

# One tokenized Doc per country name serves as a phrase pattern.
matcher = PhraseMatcher(nlp.vocab)
matcher.add('COUNTRY', None, *nlp.pipe(COUNTRIES))


def countries_component(doc):
    """Overwrite ``doc.ents`` with one "GPE" span per phrase-matcher hit."""
    gpe_spans = []
    for match_id, start, end in matcher(doc):
        gpe_spans.append(Span(doc, start, end, label="GPE"))
    doc.ents = gpe_spans
    return doc


# Add the component to the pipeline and show the resulting pipe names.
nlp.add_pipe(countries_component)
print(nlp.pipe_names)


def get_capital(span):
    """Look up the span text in the dictionary of country capitals."""
    # NOTE(review): CAPITALS is not defined in this snippet; presumably it is
    # loaded elsewhere in the file -- confirm.
    return CAPITALS.get(span.text)


# Register the Span extension attribute 'capital' with the getter above.
Span.set_extension('capital', getter=get_capital)

# Process the text and print each entity's text, label and capital.
doc = nlp("Czech Republic may help Slovakia protext its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
def getSentences(text):
    """Return the sentences of *text* as a list of whitespace-stripped strings.

    A blank English pipeline with only the rule-based sentencizer is built on
    every call and used to find the sentence boundaries.
    """
    pipeline = English()
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)

    sentences = []
    for sent in pipeline(text).sents:
        sentences.append(sent.string.strip())
    return sentences
def prepare_data(self):
    """
    Create the data using the ``huggingface/nlp`` library. This function handles
    downloading, preprocessing, tokenization, and feature extraction.

    Steps, in order:

    1. Return early when ``hparams.no_prepare_data`` is set or every path in
       ``self.tokenized_data_file_paths`` already exists (the process exits with
       status 0 instead if ``hparams.only_preprocess`` is also set).
    2. Load a spaCy pipeline, but only when ``hparams.split_char`` is falsy.
    3. Load the dataset into ``self.dataset``. For ``"scientific_papers"`` the
       PubMed and arXiv sections are concatenated and written to an arrow file
       under ``hparams.cache_file_path``.
    4. For each split, filter out empty examples (only when the tokenized file
       does not exist yet) and map ``convert_to_features`` over the split.
    5. Exit with status 0 when ``hparams.only_preprocess`` is set.
    """
    all_tokenized_files_present = all(
        os.path.isfile(path)
        for path in self.tokenized_data_file_paths.values())
    if self.hparams.no_prepare_data or all_tokenized_files_present:
        logger.info(
            "Skipping data preparation because `--no_prepare_data` was specified or all the final tokenized data files are present."
        )
        if self.hparams.only_preprocess:
            logger.info(
                "Exiting because both `--no_prepare_data` and `--only_preprocess` set."
            )
            sys.exit(0)
        return

    def convert_to_features(example_batch):
        """Tokenize one batch of examples into padded source/target features.

        Returns a dict with the keys ``source``, ``target``, ``source_mask``
        and ``target_mask``. Reads ``spacy_nlp`` from the enclosing scope, which
        is only assigned (further down) when ``hparams.split_char`` is falsy.
        """
        max_length = self.tokenizer.model_max_length

        articles = example_batch[self.hparams.data_example_column]

        articles_encoded_step = []
        # Articles are encoded one at a time so a failing article can be reported.
        for idx, article in enumerate(articles):
            article = article.strip()
            try:
                article_encoded = self.tokenizer(
                    article,
                    padding="max_length",
                    truncation=True,
                )
                articles_encoded_step.append(article_encoded)
            except:  # skipcq: FLK-E722
                print("Failed to tokenize article: {}".format(article))
                sys.exit(1)

            # Every encoded article must have the same length as the first one.
            if idx != 0:
                current_length = len(article_encoded["input_ids"])
                first_length = len(articles_encoded_step[0]["input_ids"])
                assert (
                    current_length == first_length
                ), "The length of the current input, {}, does not match the length of the first input, {}.".format(
                    current_length, first_length)

        articles_encoded = {
            "input_ids": [i["input_ids"] for i in articles_encoded_step],
            "attention_mask":
            [i["attention_mask"] for i in articles_encoded_step],
        }

        # articles_encoded = self.tokenizer.batch_encode_plus(
        #     articles, pad_to_max_length=True, truncation=True,
        # )

        highlights = example_batch[self.hparams.data_summarized_column]

        # Tokenize highlights using spacy to split them into sentences if they were not
        # already split in the dataset (use `hparams.split_char` to specify the sentence
        # boundary character)
        if not self.hparams.split_char:
            highlights = tokenize(spacy_nlp,
                                  highlights,
                                  disable_progress_bar=True)

        sep_token = self.tokenizer.sep_token
        highlights_input_ids = []
        highlights_attention_masks = []

        # For each ground-truth summary
        for highlight in highlights:
            if self.hparams.split_char:
                # simply split into sentences if `hparams.split_char` is specified
                sents = highlight.split(self.hparams.split_char)
            else:
                # `highlight` is a list of sentences where each sentence is a list of tokens
                # Combine those tokens to create a list of sentences.
                sents = [
                    " ".join(list_of_ids) for list_of_ids in highlight
                ]

            assert type(sents) is list
            assert len(sents) > 0

            # Tokenize each sentence and append the `sep_token`
            sents_tokenized = []
            for sent in sents:
                assert type(sent) is str
                assert len(sent) > 0
                sent = self.tokenizer.tokenize(sent)
                sent.append(sep_token)
                sents_tokenized.append(sent)

            # Delete the last `sep_token` from the last sentence
            assert type(sents_tokenized[-1][-1]) is str
            del sents_tokenized[-1][-1]

            # Flatten `sents_tokenized` (a list of sentences where each sentence is a list
            # of tokens) to a list of tokens
            sents_tokenized_flat = list(
                itertools.chain.from_iterable(sents_tokenized))
            assert type(sents_tokenized_flat[0]) is str
            assert len(sents_tokenized_flat) > 0

            # Convert the tokens to `input_ids`
            # `max_length` is the max length minus 2 because we need to add the
            # beginning and ending tokens to the target
            sents_input_ids = self.tokenizer.encode_plus(
                sents_tokenized_flat,
                truncation=True,
                is_pretokenized=True,
                add_special_tokens=False,
                max_length=(max_length - 2),
                return_attention_mask=False,
                return_token_type_ids=False,
            )["input_ids"]

            # Insert beginning of sequence token and append end of sequence token.
            sents_input_ids.insert(0, self.target_boseq_token_id)
            sents_input_ids.append(self.target_eoseq_token_id)

            # Create attention mask
            attention_mask = [1] * len(sents_input_ids)

            # Append the `input_ids` and `attention_mask`
            highlights_input_ids.append(sents_input_ids)
            highlights_attention_masks.append(attention_mask)

        # Pad the highlight input ids and attention masks to `tokenizer.max_len`.
        # The articles have already been padded because they do not need the extra
        # `boseq` and `eoseq` tokens.
        highlights_input_ids = pad(
            highlights_input_ids,
            self.tokenizer.pad_token_id,
            width=max_length,
        )
        highlights_attention_masks = pad(highlights_attention_masks,
                                         0,
                                         width=max_length)

        return {
            "source": articles_encoded["input_ids"],
            "target": highlights_input_ids,
            "source_mask": articles_encoded["attention_mask"],
            "target_mask": highlights_attention_masks,
        }

    def remove_empty(batch_item):
        """Filter predicate: truthy when both article and summary are non-empty.

        When ``hparams.use_percentage_of_data`` is truthy the example must also
        pass a ``random.random() < use_percentage_of_data`` draw, so only about
        that fraction of the non-empty examples is kept.
        """
        article = batch_item[self.hparams.data_example_column]
        article = article.strip()
        highlight = batch_item[self.hparams.data_summarized_column]
        highlight = highlight.strip()
        # keep_article = article and article != "\n" and article != ""
        # keep_highlight = highlight and highlight != "\n" and highlight != ""
        if self.hparams.use_percentage_of_data:
            # Not wrapped in bool(): may evaluate to an empty string (falsy).
            keep_example = (
                article and highlight
                and random.random() < self.hparams.use_percentage_of_data)
        else:
            keep_example = bool(article and highlight)

        return keep_example

    # Load spacy if the summary column does not contain separated sentences
    if not self.hparams.split_char:
        # load spacy english small model with the "tagger" and "ner" disabled since
        # we only need the "tokenizer" and "parser"
        # more info: https://spacy.io/usage/processing-pipelines
        if self.hparams.sentencizer:
            spacy_nlp = English()
            sentencizer = spacy_nlp.create_pipe("sentencizer")
            spacy_nlp.add_pipe(sentencizer)
        else:
            spacy_nlp = spacy.load("en_core_web_sm",
                                   disable=["tagger", "ner"])

    # Combine the two sections of `scientific_papers` if it is chosen as the dataset
    if self.hparams.dataset == "scientific_papers":
        self.hparams.data_example_column = "article"
        self.hparams.data_summarized_column = "abstract"

        dataset_pubmed = nlp.load_dataset(
            "scientific_papers",
            "pubmed",
            cache_dir=self.hparams.nlp_cache_dir)
        dataset_arxiv = nlp.load_dataset(
            "scientific_papers",
            "arxiv",
            cache_dir=self.hparams.nlp_cache_dir)

        combined_dataset = {}
        for (
                split,
                save_path_final_tokenized,
        ) in self.tokenized_data_file_paths.items():
            save_path = os.path.join(
                self.hparams.cache_file_path,
                ("arxiv_pubmed_combined_" + split + ".arrow"),
            )
            # If the file has not been saved to disk then combine arXiv and PubMed
            # and write to file. Don't process if the final tokenized version is
            # present and can be loaded.
            if (not os.path.exists(save_path)) and (
                    not os.path.exists(save_path_final_tokenized)):
                logger.info("Joining split %s", split)
                new = pyarrow.concat_tables([
                    dataset_pubmed[split].data, dataset_arxiv[split].data
                ])

                writer = nlp.arrow_writer.ArrowWriter(path=save_path)
                writer.write_table(new)
            else:
                logger.info(
                    "Skipping joining split %s because it already exists",
                    split)

            if not os.path.exists(save_path_final_tokenized):
                # Load combined dataset from file if the final tokenized version
                # does not exist.
                logger.info("Loading split %s", save_path)
                combined_dataset[split] = nlp.Dataset.from_file(save_path)
            else:
                # If the tokenzed split already exists then just store the pubmed
                # section as a placeholder so `nlp` does not complain.
                logger.info(
                    "NOT loading split %s because the final tokenized version already exists.",
                    save_path,
                )
                combined_dataset[split] = dataset_pubmed[split]

        self.dataset = combined_dataset

    else:
        # A list whose first entry contains "/" is treated as file paths, paired
        # positionally with the splits of `tokenized_data_file_paths`.
        # NOTE(review): this branch writes into `self.dataset[split]`, so it
        # presumably expects `self.dataset` to exist already -- confirm.
        if type(self.hparams.dataset
                ) is list and "/" in self.hparams.dataset[0]:
            for (split, _), dataset_path in zip(
                    self.tokenized_data_file_paths.items(),
                    self.hparams.dataset):
                self.dataset[split] = nlp.Dataset.from_file(dataset_path)
        else:
            self.dataset = nlp.load_dataset(
                self.hparams.dataset,
                self.hparams.dataset_version,
                cache_dir=self.hparams.nlp_cache_dir,
            )

    for split, features_cache_file in self.tokenized_data_file_paths.items(
    ):
        # If the tokenized version has not been created yet, then do the initial
        # filtering so it can be created
        if not os.path.isfile(features_cache_file):
            logger.info("Removing empty examples from %s dataset", split)
            start_num_examples = len(self.dataset[split])
            self.dataset[split] = self.dataset[split].filter(
                remove_empty,
                cache_file_name=os.path.join(self.hparams.cache_file_path,
                                             (split + "_filtered")),
            )
            end_num_examples = len(self.dataset[split])
            logger.info(
                "Removed %i (%.2f%%) examples from the dataset.",
                start_num_examples - end_num_examples,
                (1 - end_num_examples / start_num_examples) * 100,
            )

        logger.info("Converting %s dataset to features", split)
        self.dataset[split] = self.dataset[split].map(
            convert_to_features,
            batched=True,
            remove_columns=self.dataset[split].data.column_names,
            cache_file_name=features_cache_file,
        )

    # Exit if set to only preprocess the data
    if self.hparams.only_preprocess:
        logger.info(
            "Exiting because data has been pre-processed and the `--only_preprocess` option is enabled."
        )
        sys.exit(0)
def get_sentencizer() -> spacy.language.Language:
    """Return a blank English pipeline whose only component is the sentencizer."""
    pipeline = English()
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)
    return pipeline
def transcribe(
    self,
    file_uri: Union[str, Path],
    phrases: Optional[List[str]] = None,
    **kwargs: Any,
) -> transcript_model.Transcript:
    """
    Transcribe audio from a GCS file and return a Transcript model.

    Parameters
    ----------
    file_uri: Union[str, Path]
        The GCS file uri to the audio file or caption file to transcribe.
        It should be in format 'gs://...'.
    phrases: Optional[List[str]] = None
        A list of strings to feed as targets to the model.

    Returns
    -------
    outputs: transcript_model.Transcript
        The transcript model for the supplied media file.

    Notes
    -----
    Only the first alternative of every result is used. Its transcript is
    split into sentences with a spaCy sentencizer, and word timings are
    assigned to each sentence by counting its whitespace-separated words.
    """
    # Client authenticated with the configured service-account file
    client = speech.SpeechClient.from_service_account_file(
        filename=str(self.credentials_file))

    # Basic recognition metadata
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = (
        speech.RecognitionMetadata.InteractionType.PHONE_CALL)
    metadata.original_media_type = (
        speech.RecognitionMetadata.OriginalMediaType.VIDEO)

    # Caller-supplied phrases become an additional speech context
    event_metadata_speech_context = speech.SpeechContext(
        phrases=self._clean_phrases(phrases))

    # Recognition request
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        enable_spoken_punctuation=True,
        speech_contexts=[
            GOOGLE_SPEECH_ADAPTION_CLASSES,
            event_metadata_speech_context,
        ],
        metadata=metadata,
        model="video",
        use_enhanced=True,
    )
    audio = speech.RecognitionAudio(uri=file_uri)

    # Start the long-running job and block until it finishes
    log.debug(f"Beginning transcription for: {file_uri}")
    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10800)

    # Running totals for the mean confidence
    confidence_sum = 0
    segments = 0

    timestamped_sentences: List[transcript_model.Sentence] = []
    transcript_sentence_index = 0

    # Sentence boundary pipeline
    sentence_splitter = English()
    sentence_splitter.add_pipe("sentencizer")

    for result in response.results:
        # Some portions of audio may not have text
        if len(result.alternatives) == 0:
            continue
        best = result.alternatives[0]

        sentences = [str(sent) for sent in sentence_splitter(best.transcript).sents]

        # Offset of the current sentence's first word in `best.words`
        w_marker = 0
        for s_text in sentences:
            num_words = len(s_text.split())

            timestamped_sentence = transcript_model.Sentence(
                index=transcript_sentence_index,
                confidence=best.confidence,
                # Start and end time are placeholders, filled in below
                start_time=0.0,
                end_time=0.0,
                words=[],
                text=s_text,
            )

            for offset in range(num_words):
                word = best.words[w_marker + offset]

                # Nanos no longer supported, use microseconds instead
                # https://github.com/googleapis/python-speech/issues/71
                start_time = (word.start_time.seconds +
                              word.start_time.microseconds * 1e-6)
                end_time = (word.end_time.seconds +
                            word.end_time.microseconds * 1e-6)

                # The first word sets the sentence start, the last its end
                if offset == 0:
                    timestamped_sentence.start_time = start_time
                if offset == (num_words - 1):
                    timestamped_sentence.end_time = end_time

                timestamped_sentence.words.append(
                    transcript_model.Word(
                        index=offset,
                        start_time=start_time,
                        end_time=end_time,
                        text=self._clean_word(word.word),
                    ))

            w_marker += num_words
            timestamped_sentences.append(timestamped_sentence)
            transcript_sentence_index += 1

        confidence_sum += best.confidence
        segments += 1

    # Mean confidence over the results that had text
    confidence = confidence_sum / segments if segments > 0 else 0.0
    log.info(
        f"Completed transcription for: {file_uri}. Confidence: {confidence}"
    )

    return transcript_model.Transcript(
        generator=f"Google Speech-to-Text -- CDP v{__version__}",
        confidence=confidence,
        session_datetime=None,
        created_datetime=datetime.utcnow().isoformat(),
        sentences=timestamped_sentences,
    )
def test_issue7065_b():
    """The entity linker must not crash when an entity crosses a sentence boundary."""
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")

    text = "Mahler 's Symphony No. 8 was beautiful."
    person_offsets = (0, 6)
    work_offsets = (10, 24)
    annotations = {
        "entities": [
            person_offsets + ("PERSON",),
            work_offsets + ("WORK",),
        ],
        "links": {
            person_offsets: {"Q7304": 1.0, "Q270853": 0.0},
            work_offsets: {"Q7304": 0.0, "Q270853": 1.0},
        },
        # The gold sentence start at token 2 falls inside the "WORK" entity.
        "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0],
    }
    train_examples = [Example.from_dict(nlp(text), annotations)]

    def create_kb(vocab):
        # Artificial KB holding the two entities and one alias for each
        kb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        kb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        kb.add_alias(alias="No. 8", entities=["Q270853"], probabilities=[1.0])
        kb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Mahler", entities=["Q7304"], probabilities=[1.0])
        return kb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)

    # Train the NEL pipe for two updates
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Rule-based component standing in for NER
    person_pattern = {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}
    work_pattern = {
        "label": "WORK",
        "pattern": [
            {"LOWER": "symphony"},
            {"LOWER": "no"},
            {"LOWER": "."},
            {"LOWER": "8"},
        ],
    }
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([person_pattern, work_pattern])

    # Running the trained pipeline should not throw E148
    doc = nlp(text)
    assert doc
# sentence tokenization
from spacy.lang.en import English

# Blank English pipeline: only the tokenizer, no trained components
nlp = English()

# Create the rule-based 'sentencizer' component and add it to the pipeline
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)

text = """When learning Artificial Intelligence, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

# Calling the "nlp" object on a string produces the annotated document
doc = nlp(text)

# Collect the text of every detected sentence
sents_list = [sent.text for sent in doc.sents]
print(sents_list)
def find_usage_examples_from_summary(
        self,
        form: Form = None,
) -> List[UsageExample]:
    """Find sentences in ``self.text`` that contain the form and clean them.

    The text is split into sentences with a spaCy pipeline chosen from
    ``self.language_code``. A sentence is relevant when the lower-cased form
    representation occurs in it as a whole, space-delimited word after common
    punctuation has been replaced by spaces. Relevant sentences whose word
    count lies strictly between ``config.min_word_count`` and
    ``config.max_word_count`` are returned, shortest first.

    Parameters
    ----------
    form: Form
        The form whose ``representation`` is searched for. Required despite
        the ``None`` default.

    Returns
    -------
    List[UsageExample]
        One example per distinct matching sentence, ordered by sentence
        length and then alphabetically.

    Raises
    ------
    ValueError
        If ``form`` is ``None``.
    ModuleNotFoundError
        If the spaCy model for the language is not installed.
    NotImplementedError
        If the language is not supported.
    """
    if form is None:
        raise ValueError("form was None")
    logger = logging.getLogger(__name__)

    # Pick a sentence-splitting pipeline for the language
    if self.language_code == WikimediaLanguageCode.ENGLISH:
        logger.info("using the English spaCy pipeline")
        nlp = English()
        nlp.add_pipe('sentencizer')
    elif self.language_code == WikimediaLanguageCode.SWEDISH:
        nlp = Swedish()
        nlp.add_pipe('sentencizer')
    elif (self.language_code == WikimediaLanguageCode.FRENCH
          or self.language_code == WikimediaLanguageCode.GERMAN
          or self.language_code == WikimediaLanguageCode.BOKMÅL
          or self.language_code == WikimediaLanguageCode.DANISH):
        logger.info(
            f"using the {self.language_code.name.title()} spaCy pipeline")
        try:
            nlp = spacy.load(f'{self.language_code.value}_core_news_sm')
        except OSError as err:
            # spaCy raises OSError when the model package cannot be found.
            # Catch only that, so Ctrl-C and unrelated bugs are not
            # misreported, and keep the original error as the cause.
            raise ModuleNotFoundError(
                f"Please install the spacy model for "
                f"{self.language_code.name.title()} by running: "
                f"'python -m spacy download "
                f"{self.language_code.value}_core_news_sm' "
                f"in the terminal/cmd/powershell") from err
    else:
        raise NotImplementedError(
            f"Sentence extraction for {self.language_code.name} "
            f"is not supported yet, feel free to open an issue at "
            f"https://github.com/dpriskorn/LexUtils/issues")

    doc = nlp(self.text)
    needle = f" {form.representation.lower()} "
    sentences = set()
    for sentence in doc.sents:
        # This is a very crude test for relevancy, we lower first to improve matching
        cleaned_sentence = sentence.text.lower()
        for punctation in (".", ",", "!", "?", "„", "“", "»"):
            cleaned_sentence = cleaned_sentence.replace(punctation, " ")
        cleaned_sentence = cleaned_sentence.strip()
        logger.debug(f"cleaned sentence:{cleaned_sentence}")
        # Pad with spaces so a form that is the first or last word of the
        # sentence also matches the space-delimited needle.
        if needle in f" {cleaned_sentence} ":
            # Add to the set first to avoid duplicates
            sentences.add(sentence.text)

    examples = []
    # Sort so the result is deterministic and the shortest sentence comes first
    for sentence in sorted(sentences, key=lambda text: (len(text), text)):
        sentence_length = len(sentence.split(" "))
        if config.min_word_count < sentence_length < config.max_word_count:
            # Clean the sentence so it looks better
            for punctation in ("„", "“", "»"):
                sentence = sentence.replace(punctation, " ")
            sentence = sentence.strip()
            examples.append(UsageExample(sentence=sentence, record=self))
    return examples
def test_partial_links():
    """Entities on the doc without gold links must not crash the entity linker."""
    text = "Russ Cochran his reprints include EC Comics."
    annotation = {
        "links": {(0, 12): {"Q2146908": 1.0}},
        "entities": [(0, 12, "PERSON")],
        "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
    }
    TRAIN_DATA = [(text, annotation)]

    nlp = English()
    vector_length = 3
    train_examples = [
        Example.from_dict(nlp(raw_text), gold)
        for raw_text, gold in TRAIN_DATA
    ]

    def create_kb(vocab):
        # Artificial KB with a single entity and one alias
        kb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return kb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Additional components that the entity_linker requires at runtime
    nlp.add_pipe("sentencizer", first=True)
    person_pattern = {
        "label": "PERSON",
        "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}],
    }
    org_pattern = {
        "label": "ORG",
        "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}],
    }
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns([person_pattern, org_pattern])

    # This runs the pipeline on the examples and shouldn't crash
    results = nlp.evaluate(train_examples)
    ents_per_type = results["ents_per_type"]
    nel_per_type = results["nel_f_per_type"]
    assert "PERSON" in ents_per_type
    assert "PERSON" in nel_per_type
    assert "ORG" in ents_per_type
    # "ORG" has no gold link, so it must not be scored by the linker
    assert "ORG" not in nel_per_type
def main():
    """Run the pumps product CSV through a sentencizer-only spaCy pipeline.

    Reads the pipe-delimited product file, collects the product id, product,
    supplier and MPN columns into lists, joins every data row into one text
    blob (one sentence per record), cleans it with
    ``preprocessor.string_cleaner`` and prints diagnostic output: the
    collected lists, the cleaned text, the detected sentences, the
    ``like_num`` flag of every token and the pipeline contents.

    :raises IndexError: if a data row has fewer than four columns
    """
    # get just the language with no model
    nlp = English()
    # nlp = spacy.load('en_core_web_sm')

    # add the sentencizer component to the pipeline
    # rem this component splits sentences on punctuation such as . ! ?
    # plugging it into pipeline to get just the sentence boundaries
    # without the dependency parse.
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    # An 'ner' component is deliberately not added here: without a loaded
    # model spaCy reports "Model for component 'ner' not initialized. Did you
    # forget to load a model, or forget to call begin_training()?"
    #   ner = nlp.create_pipe("ner")
    #   nlp.add_pipe(ner)

    # get product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object
    headers = []
    productIDs = []
    products = []
    suppliers = []
    mpns = []
    record_lines = []

    # newline='' lets the csv module handle line endings itself.
    with open('../../store/model/erp10/pumps/prod_pumps_erp10.csv',
              newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter='|')

        # The first row is the header; it is kept out of the text object.
        header_row = next(reader, None)
        if header_row is not None:
            headers.append(header_row)

        for row in reader:
            productIDs.append(row[0])
            products.append(row[1])
            suppliers.append(row[2])
            mpns.append(row[3])
            # rem add a period at the end so that the spacy sentencizer
            # knows how to detect the end of each record
            record_lines.append(' '.join(row) + '.\n')

    # Build the text object once instead of concatenating per row.
    txt_obj = ''.join(record_lines)

    # TEST print -----------------------
    testList = [headers, productIDs, products, suppliers, mpns]
    print('testList items:\n')
    for item in testList:
        print(item)

    # clean the text object
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # create the nlp object:
    pumps_erp10 = nlp(txt_obj)

    # TEST print -----------------------
    print('\n\npumps_erp10 after sentencizer:\n')
    for sent in pumps_erp10.sents:
        print(sent.text, '**end row**', end='')

    # TEST print -----------------------
    print('\n\ntoken.like_num in nlp obj:\n')
    for token in pumps_erp10:
        print(token.like_num, ',', end='')

    # stuff we get:
    # token, .text, .i, .idx, .tag_, .lemma_
    # .is_punct, .is_space, .like_num
    print('\nDone.')

    # stuff we don't get:
    # pos, ent, chunking,
    # LU
    # textcat (TextCategorizer, Doc.cats)
    # custom components (Doc._.xxx, Token._.xxx, Span._.xxx)
    # create_pipe, add_pipe

    # TEST print -----------------------
    print('\n', nlp.pipeline)
    print('\n', nlp.pipe_names)
#!/usr/bin/env python # coding: utf-8 # ### Importing Dependencies import pandas as pd import spacy from spacy.pipeline import EntityRuler from spacy.lang.en import English ##Initiating Spacy english version nlp = English() ## Instantiating Entity ruler and adding to nlp via pipe ruler = EntityRuler(nlp) nlp.add_pipe(ruler) ##Creating Entity pattern ### To retrieve patterns from the text for NER class EntityPattern(): def dash_split(self, sentence): ##If pattern list has no value then split directly else loop the list and split each value if len(self.split_pattern_list) == 0: self.split_pattern_list = sentence.split('-') else: dash_list = [] for word in self.split_pattern_list: word_list = word.split('-') for split_word in word_list: dash_list.append(split_word)
class Segmentation:
    """Segment the sentences around entity pairs of an annotated dataset.

    For every considered entity pair, the sentence(s) containing the pair are
    cut into five parts (preceding, concept 1, middle, concept 2, succeeding)
    and stored together with the relation label and the track information
    (file number, entity 1 id, entity 2 id) needed to write predictions back
    to file. All results are gathered in ``self.segments``.
    """

    def __init__(self,
                 dataset=None,
                 entity_labels=None,
                 no_rel_label=None,
                 no_rel_multiple=False,
                 sentence_align=False,
                 test=False,
                 same_entity_relation=False,
                 write_Entites=False,
                 generalize=False,
                 parallelize=False,
                 no_of_cores=64,
                 predictions_folder=None,
                 de_sample=None):
        """
        Read the data files, segment every considered entity pair and write
        the resulting segment lists to file.

        :param dataset: iterable of (datafile, txt_path, ann_path) triples
        :param entity_labels: labels of the entities that create the relations;
            the first label is treated as the dominant one unless ``generalize``
        :param no_rel_label: label(s) used for entity pairs without an annotated
            relation; a falsy value disables no-relation pairs
        :param no_rel_multiple: if True ``no_rel_label`` is used as given,
            otherwise only its first element is used
        :param sentence_align: if True sentences are split on newlines only,
            otherwise on newline, '.' and '?'
        :param test: flag to run test-segmentation options (pairs get the
            label "No Label")
        :param same_entity_relation: flag when relation exists between same type of entities
        :param write_Entites: write entities to the predictions folder
        :param generalize: flag when relations are not dependent on the first given entity label
        :param parallelize: flag to parallelize the segmentation
        :param no_of_cores: no of worker processes for the parallelized segmentation
        :param predictions_folder: path to predictions (output) folder
        :param de_sample: flag to reduce the no of samples (only stored here)
        """
        self.predictions_folder = predictions_folder
        self.dataset = dataset
        self.entity_labels = entity_labels
        self.test = test
        self.same_entity_relation = same_entity_relation
        self.generalize = generalize
        self.parallelize = parallelize
        self.write_Entites = write_Entites
        self.nlp_model = English()
        self.nlp_model.max_length = 2000000

        # Falsy values are normalised to False so later truth tests are uniform.
        self.no_rel_label = no_rel_label if no_rel_label else False
        self.no_rel_multiple = no_rel_multiple
        self.de_sample = de_sample if de_sample else False

        if sentence_align:
            sentencizer = Sentencizer(punct_chars=["\n"])
        else:
            sentencizer = Sentencizer(punct_chars=["\n", ".", "?"])

        # Start from a clean predictions folder when entities are written out.
        if self.write_Entites and self.predictions_folder is not None:
            ext = ".ann"
            file.delete_all_files(predictions_folder, ext)

        self.nlp_model.add_pipe(sentencizer)
        # self.nlp_model = spacy.load('en_core_web_sm')

        # global segmentation object that returns all segments and the label
        self.segments = {
            'seg_preceding': [],
            'seg_concept1': [],
            'seg_concept2': [],
            'seg_concept1_label': [],
            'seg_concept2_label': [],
            'seg_middle': [],
            'seg_succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

        if self.parallelize:
            # Pool distributes the per-file segmentation across processes.
            all_args = [[datafile, txt_path, ann_path]
                        for datafile, txt_path, ann_path in self.dataset]
            pool = Pool(no_of_cores)
            try:
                segments_file = pool.map(self.process_file_parallel, all_args)
            finally:
                pool.close()
                pool.join()

            for segment in segments_file:
                # NOTE(review): the concept labels are not merged in the
                # parallel branch (they were commented out) -- confirm that
                # this difference from the serial branch is intended.
                self._collect_segments(segment, with_concept_labels=False)
        else:
            segment = self.process_file_serial(dataset)
            self._collect_segments(segment, with_concept_labels=True)

        if not self.test:
            # print the number of instances of each relation classes
            print([(i, self.segments['label'].count(i))
                   for i in set(self.segments['label'])])

        # write the segments to a file
        file.list_to_file('sentence_test', self.segments['sentence'])
        file.list_to_file('preceding_seg', self.segments['seg_preceding'])
        file.list_to_file('concept1_seg', self.segments['seg_concept1'])
        file.list_to_file('middle_seg', self.segments['seg_middle'])
        file.list_to_file('concept2_seg', self.segments['seg_concept2'])
        file.list_to_file('succeeding_seg', self.segments['seg_succeeding'])
        file.list_to_file('track_test', self.segments['track'])
        # if not self.test:
        #     file.list_to_file('labels_test', self.segments['label'])
        #     file.list_to_file('concept1_seg_label', self.segments['seg_concept1_label'])
        #     file.list_to_file('concept2_seg_label', self.segments['seg_concept2_label'])

    @staticmethod
    def _empty_file_segments():
        """Return a fresh per-file segment container with empty lists."""
        return {
            'preceding': [],
            'concept1': [],
            'concept2': [],
            'concept1_label': [],
            'concept2_label': [],
            'middle': [],
            'succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

    def _collect_segments(self, segment, with_concept_labels):
        """Append one per-file segment container to ``self.segments``."""
        key_pairs = [
            ('preceding', 'seg_preceding'),
            ('concept1', 'seg_concept1'),
            ('middle', 'seg_middle'),
            ('concept2', 'seg_concept2'),
            ('succeeding', 'seg_succeeding'),
            ('sentence', 'sentence'),
            ('track', 'track'),
            ('label', 'label'),
        ]
        if with_concept_labels:
            key_pairs.append(('concept1_label', 'seg_concept1_label'))
            key_pairs.append(('concept2_label', 'seg_concept2_label'))
        for source_key, target_key in key_pairs:
            self.segments[target_key].extend(segment[source_key])

    def _segment_file(self, datafile, txt_path, ann_path):
        """Load one text/annotation pair and return its segments."""
        print("File", datafile)
        self.file = datafile
        self.ann_path = ann_path
        self.txt_path = txt_path
        self.ann_obj = Annotation(self.ann_path)

        # Context manager so the text file handle is always released.
        with open(self.txt_path) as txt_file:
            content = txt_file.read()
        # content_text = normalization.replace_Punctuation(content)
        self.doc = self.nlp_model(content)

        file_name = str(datafile) + ".ann"
        if self.write_Entites and self.predictions_folder is not None:
            write_entities_to_file(self.ann_obj, file_name,
                                   self.predictions_folder)
        # else:
        #     print("Define the path to the folder to save predictions ")

        return self.get_Segments_from_sentence(self.ann_obj)

    def process_file_parallel(self, dataset):
        """
        Segment a single file; used as the worker function when the
        segmentation is distributed across processes.

        :param dataset: sequence (datafile, txt_path, ann_path) for one file
        :return: segments of that file
        """
        return self._segment_file(dataset[0], dataset[1], dataset[2])

    def process_file_serial(self, dataset):
        """
        Segment all files of the dataset one after another and return the
        combined segments of every file.

        :param dataset: iterable of (datafile, txt_path, ann_path) triples
        :return: segments of the whole dataset (empty lists for an empty dataset)
        """
        merged = self._empty_file_segments()
        for datafile, txt_path, ann_path in dataset:
            segment = self._segment_file(datafile, txt_path, ann_path)
            for key, values in segment.items():
                merged.setdefault(key, []).extend(values)
        return merged

    def get_Segments_from_relations(self, ann):
        """
        For each relation object, it identifies the label and the entities first, then extracts the span of the
        entities from the text file using the start and end character span of the entities. Then it finds the
        sentence the entities are located in and passes the sentence and the spans of the entities to the
        function that extracts the following segments:

        Preceding - (tokenize words before the first concept)
        concept 1 - (tokenize words in the first concept)
        Middle - (tokenize words between 2 concepts)
        concept 2 - (tokenize words in the second concept)
        Succeeding - (tokenize words after the second concept)

        :param ann: annotation object
        :return: segments and label
        """
        # object to store the segments of a relation object
        segment = {
            'preceding': [],
            'concept1': [],
            'concept2': [],
            'middle': [],
            'succeeding': [],
            'sentence': [],
            'label': []
        }

        entities = ann.annotations['entities']
        for label_rel, entity1, entity2 in ann.annotations['relations']:
            start_C1 = entities[entity1][1]
            end_C1 = entities[entity1][2]
            start_C2 = entities[entity2][1]
            end_C2 = entities[entity2][2]

            # arrange the entities in the order they are located in the sentence
            if start_C1 < start_C2:
                concept_1 = self.doc.char_span(start_C1, end_C1)
                concept_2 = self.doc.char_span(start_C2, end_C2)
            else:
                concept_1 = self.doc.char_span(start_C2, end_C2)
                concept_2 = self.doc.char_span(start_C1, end_C1)

            if concept_1 is None or concept_2 is None:
                # NOTE(review): this stops at the first pair whose span cannot
                # be resolved and drops all remaining relations; `continue`
                # may have been intended -- behaviour kept as is.
                break

            # get the sentence where the entity is located
            sentence_C1 = str(concept_1.sent)
            sentence_C2 = str(concept_2.sent)

            # if both entities are located in the same sentence return the sentence or
            # concatenate the individual sentences where the entities are located in to one sentence
            if sentence_C1 == sentence_C2:
                sentence = sentence_C1
            else:
                sentence = sentence_C1 + " " + sentence_C2

            sentence = normalization.remove_Punctuation(str(sentence).strip())
            concept_1 = normalization.remove_Punctuation(
                str(concept_1).strip())
            concept_2 = normalization.remove_Punctuation(
                str(concept_2).strip())
            preceding, middle, succeeding = extract_Segments(
                sentence, concept_1, concept_2)

            segment['concept1'].append(concept_1)
            segment['concept2'].append(concept_2)
            segment['sentence'].append(sentence.replace('\n', ' '))
            segment['preceding'].append(preceding.replace('\n', ' '))
            segment['middle'].append(middle.replace('\n', ' '))
            segment['succeeding'].append(succeeding.replace('\n', ' '))
            segment['label'].append(label_rel)

        return segment

    def _add_pair_segments(self, ann, key1, key2, doc_segments, token):
        """Label the entity pair (key2, key1) and add its segments.

        In test mode the pair gets the label "No Label". Otherwise the first
        annotated relation whose entities are (key2, key1) supplies the label;
        if none was found for this second entity (``token`` still True) and a
        no-relation label is configured, the pair is labelled as no-relation.

        :return: (doc_segments, token); token is False once an annotated
            relation was found
        """
        if self.test:
            segment = self.extract_sentences(ann, key2, key1, "No Label")
            if segment is not None:
                doc_segments = add_file_segments(doc_segments, segment)
            return doc_segments, token

        # for the relations that exist in the ann files
        for label_rel, entity1, entity2 in ann.annotations['relations']:
            if key2 == entity1 and key1 == entity2:
                # when a match with an existing relation is found
                segment = self.extract_sentences(ann, entity1, entity2,
                                                 label_rel, True)
                # extract_sentences yields None when a span cannot be resolved
                if segment is not None:
                    doc_segments = add_file_segments(doc_segments, segment)
                token = False
                break

        # No annotated relation for this pair
        if token and self.no_rel_label:
            if self.no_rel_multiple:
                label_rel = self.no_rel_label
            else:
                label_rel = self.no_rel_label[0]
            segment = self.extract_sentences(ann, key2, key1, label_rel)
            if segment is not None:
                doc_segments = add_file_segments(doc_segments, segment)

        return doc_segments, token

    def get_Segments_from_sentence(self, ann):
        """
        In the annotation object, it identifies the sentence each problem entity is located and tries to determine
        the relations between other problem entities and other entity types in the same sentence. When a pair of
        entities is identified first it checks whether a annotated relation type exists, in that case it labels with
        the given annotated label if not it labels as a No - relation pair. finally it passes the sentence and the
        spans of the entities to the function that extracts the following segments:

        Preceding - (tokenize words before the first concept)
        concept 1 - (tokenize words in the first concept)
        Middle - (tokenize words between 2 concepts)
        concept 2 - (tokenize words in the second concept)
        Succeeding - (tokenize words after the second concept)

        :param ann: annotation object
        :return: segments and label: preceding, concept_1, middle, concept_2, succeeding, label
        """
        # object to store the segments of a relation object for a file
        doc_segments = self._empty_file_segments()

        # list to store the identified relation pair when both entities are same
        self.entity_holder = []

        entities = ann.annotations['entities']
        for key1, value1 in entities.items():
            label1, _start1, _end1, _mention1 = value1

            # when relations are dependent on one entity (dominant), only the
            # dominant label may act as the first entity of a pair
            if not self.generalize and label1 != self.entity_labels[0]:
                continue

            for key2, value2 in entities.items():
                label2, _start2, _end2, _mention2 = value2
                # stays True until an annotated relation is found for key2
                token = True

                # relation between entities of the same (dominant) type
                if (self.same_entity_relation
                        and label2 == self.entity_labels[0]
                        and key1 != key2):
                    doc_segments, token = self._add_pair_segments(
                        ann, key1, key2, doc_segments, token)

                # match the first entity with the other entity types
                for i in range(len(self.entity_labels) - 1):
                    if (not self.same_entity_relation
                            and label2 == self.entity_labels[i + 1]):
                        doc_segments, token = self._add_pair_segments(
                            ann, key1, key2, doc_segments, token)

        return doc_segments

    def extract_sentences(self,
                          ann,
                          entity1,
                          entity2,
                          label_rel=None,
                          join_sentences=False):
        """
        when the two entities are give as input, it identifies the sentences they are located and determines whether the
        entity pair is in the same sentence or not. if not they combine the sentences if there an annotated relation exist
        and returns None if an annotated relation doesn't exist

        :param ann: annotation object
        :param entity1: first entity in the considered pair
        :param entity2: second entity in the considered pair
        :param label_rel: relation type
        :param join_sentences: True when the pair comes from an annotated relation;
            the sentences of both entities are then joined if they differ
        :return: segments, sentence, label and track information, or None when
            a span cannot be resolved, the entities are in different sentences
            (without join_sentences) or the reversed pair was already handled
        """
        entities = ann.annotations['entities']
        label_C1, start_C1, end_C1 = entities[entity1][:3]
        label_C2, start_C2, end_C2 = entities[entity2][:3]

        # arrange the entities in the order they are located in the sentence
        if start_C1 < start_C2:
            concept_1 = self.doc.char_span(start_C1, end_C1)
            concept_2 = self.doc.char_span(start_C2, end_C2)
        else:
            concept_1 = self.doc.char_span(start_C2, end_C2)
            concept_2 = self.doc.char_span(start_C1, end_C1)

        sentence = None
        if concept_1 is not None and concept_2 is not None:
            # get the sentence the entities are located
            sentence_C1 = str(concept_1.sent.text)
            sentence_C2 = str(concept_2.sent.text)

            if join_sentences:
                # annotated relation: join the two sentences when they differ
                if sentence_C1 == sentence_C2:
                    sentence = sentence_C1
                else:
                    sentence = sentence_C1 + " " + sentence_C2
            elif sentence_C1 == sentence_C2:
                # pair without annotated relation: strictly one sentence
                entity_pair = entity1 + '-' + entity2
                # to make sure the same entity pair is not considered twice:
                # the reversed pair is recorded, so it is rejected later
                if entity_pair not in self.entity_holder:
                    self.entity_holder.append(entity2 + '-' + entity1)
                    sentence = sentence_C1

        if sentence is None:
            return None

        sentence = normalization.remove_Punctuation(str(sentence).strip())
        concept_1 = normalization.remove_Punctuation(str(concept_1).strip())
        concept_2 = normalization.remove_Punctuation(str(concept_2).strip())
        preceding, middle, succeeding = extract_Segments(
            sentence, concept_1, concept_2)

        segment = self._empty_file_segments()
        # remove the next line character in the extracted segment by replacing the '\n' with ' '
        segment['concept1'].append(concept_1.replace('\n', ' '))
        segment['concept2'].append(concept_2.replace('\n', ' '))
        segment['sentence'].append(sentence.replace('\n', ' '))
        segment['preceding'].append(preceding.replace('\n', ' '))
        segment['middle'].append(middle.replace('\n', ' '))
        segment['succeeding'].append(succeeding.replace('\n', ' '))
        segment['label'].append(label_rel)

        # Adding the track information: file number and the numeric part of
        # both entity ids (the first character of each id is skipped)
        segment['track'].append(int(self.file))
        segment['track'].append(int(entity1[1:]))
        segment['track'].append(int(entity2[1:]))
        segment['concept1_label'].append(label_C1)
        segment['concept2_label'].append(label_C2)

        return segment