def __init__(self, text=None, nlp=None, id=0, _doc=None):
    """
    :type text: str
    """
    spacy.prefer_gpu()
    if isinstance(text, list):
        text = ' '.join(str(x) for x in text)
    self._id = id
    if text is None and _doc is None:
        raise ValueError('Either text or _doc should be given!')
    elif text is None:
        self._doc = _doc
        self._text = _doc.text
        self._nlp = nlp
    elif _doc is None:
        self._doc = None
        self._text = str(text)
        self._nlp = nlp or spacy.load('en_core_web_sm')
    else:
        raise ValueError('Either text or _doc should be None!')
    # Lazily computed caches, populated on first access
    self._tokens = None
    self._noun_chunks = None
    self._entity_chunks = None
    self._entities = None
    self._sentences = None
    self._sentences_method = None
    self._entity_graph = None
    self._syntax_graph = None
def parse_sentence(text):
    import spacy
    import re

    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_sm')  # the bare 'en' shorthand is deprecated
    boundary = re.compile(r'^[0-9]$')  # a single digit, e.g. the "1" in "1."

    def custom_seg(doc):
        # Suppress sentence breaks after enumerations like "1." (spaCy v2-style
        # function component; spaCy v3 requires the @Language.component decorator)
        prev = doc[0].text
        length = len(doc)
        for index, token in enumerate(doc):
            if (token.text == '.' and boundary.match(prev)
                    and index != (length - 1)):
                doc[index + 1].is_sent_start = False
            prev = token.text
        return doc

    nlp.add_pipe(custom_seg, before='parser')
    doc = nlp(text)
    return_out = []
    for sentence in doc.sents:
        return_out.append(parse_context(sentence.text))
    return return_out
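# Hypothetical usage sketch for parse_sentence (parse_context is the POS-tagging
# helper defined elsewhere in this collection; en_core_web_sm must be installed).
# The custom_seg component keeps the "1." enumeration from ending a sentence:
# parse_sentence("1. Buy milk. Then rest.")
# -> one list of POS tags per detected sentence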
def tokenize_function(lemmatization=True, ngrams_length=2, workers=1):
    spacy.prefer_gpu()  # call before load() so the model can be allocated on the GPU
    spacy_obj = spacy.load('en_core_web_sm')

    def tokenize(documents):
        tokenized_documents = []
        pipe = spacy_obj.pipe(documents, disable=["tagger", "parser", "ner"],
                              n_process=workers)  # n_threads is deprecated
        for doc in tqdm(pipe, desc='documents', total=len(documents)):
            tokens = []
            for token in doc:
                if token.is_stop:
                    token = '#'  # mask stopwords with a placeholder
                elif lemmatization:
                    token = token.lemma_
                else:
                    token = token.text
                token = remove_symbols(token)
                if token != '':
                    tokens.append(token.lower())
            ngrams = get_ngrams(tokens, max_length=ngrams_length)
            tokenized_documents.append(ngrams)
        return tokenized_documents

    return tokenize
def spacy_gpu_nlp(text: str = ''):
    # GPU Computation
    spacy.prefer_gpu()
    # Load English tokenizer, tagger, parser, NER and word vectors
    # nlp = spacy.load("en_core_web_sm")  # Efficient - Good
    # nlp = spacy.load("en_core_web_md")  # Blend - Will fail, need to fix test 2_EntityPhraseConcept
    nlp = spacy.load("en_core_web_lg")  # Accurate - Good

    # Process whole document
    doc = nlp(text)

    # Analyze syntax of parts
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

    ne_p_and_c = {'0_Verbs': [], '1_NounPhrases': [], '2_EntityPhraseConcept': []}

    # Find named entities, phrases and concepts
    for entity in doc.ents:
        ne_p_and_c['2_EntityPhraseConcept'].append(entity.text)
    ne_p_and_c['1_NounPhrases'] = noun_phrases
    ne_p_and_c['0_Verbs'] = verbs
    return ne_p_and_c
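# Minimal usage sketch (assumes en_core_web_lg is installed and this runs in the
# same module as spacy_gpu_nlp):
result = spacy_gpu_nlp("Apple is looking at buying U.K. startup for $1 billion.")
print(result['0_Verbs'])                # e.g. ['look', 'buy']
print(result['2_EntityPhraseConcept'])  # e.g. ['Apple', 'U.K.', '$1 billion']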
def train_model(json_file_path, epochs=20):
    train_data = preprocess(json_file_path)
    spacy.prefer_gpu()
    if len(train_data) >= 50:
        nlp = spacy.load(MODEL_PATH)
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
        with nlp.disable_pipes(*other_pipes):  # only train NER
            optimizer = nlp.resume_training()
            for itn in range(epochs):
                print("Starting iteration " + str(itn))
                random.shuffle(train_data)
                losses = {}
                for batch in minibatch(train_data):
                    texts, annotations = zip(*batch)
                    try:
                        # spaCy v2-style update on (texts, annotations) pairs
                        nlp.update(
                            texts,          # batch of texts
                            annotations,    # batch of annotations
                            drop=0.25,      # dropout - make it harder to memorise data
                            sgd=optimizer,  # callable to update weights
                            losses=losses)
                    except Exception as e:
                        print("Skipping batch:", e)  # don't swallow errors silently
                print(losses)
        nlp.to_disk(MODEL_PATH)
    return "Model trained and saved"
def __init__(self, params, spacy_size="md", gpu=True, viz=False, verbose=False):
    self.greed = get_param(params, "greed", greed)
    self.max_dist = get_param(params, "max_dist", max_dist)
    self.max_dist_match = get_param(params, "max_dist_match", max_dist_match)
    self.blacklist = get_param(params, "blacklist", blacklist)
    spacy_model = "en_core_web_{}".format(spacy_size)
    self.verbose = verbose
    if gpu:
        spacy.prefer_gpu()  # request the GPU before loading the model
    if self.verbose:
        print("Loading spacy model...")
    self.nlp = spacy.load(spacy_model, disable=["ner"])  # the ner module is not needed
    self.viz = viz  # always set, so the attribute exists even when viz is False
    self.doc = None
    self.init_coref()
def __init__(
    self,
    text_field: str,
    doc_field: str,
    language: str = EN_CORE_WEB_SM,
    disable: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    pre: Optional[List[BasePreprocessor]] = None,
    memoize: bool = False,
    memoize_key: Optional[HashingFunction] = None,
    gpu: bool = False,
) -> None:
    name = type(self).__name__
    super().__init__(
        name,
        field_names=dict(text=text_field),
        mapped_field_names=dict(doc=doc_field),
        pre=pre,
        memoize=memoize,
        memoize_key=memoize_key,
    )
    self.gpu = gpu
    if self.gpu:
        spacy.prefer_gpu()
    self._nlp = spacy.load(language, disable=disable or [], exclude=exclude or [])
def get_tokenizer(tokenize_method: str, lang='en'):
    r"""
    :param str tokenize_method: which tokenizer to use
    :param str lang: language; currently only en is supported
    :return: a tokenize function
    """
    tokenizer_dict = {
        'spacy': None,
        'raw': _raw_split,
        'cn-char': _cn_char_split,
    }
    if tokenize_method == 'spacy':
        import spacy
        spacy.prefer_gpu()
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right now.")
        if parse_version(spacy.__version__) >= parse_version('3.0'):
            en = spacy.load('en_core_web_sm')
        else:
            en = spacy.load(lang)
        tokenizer = lambda x: [w.text for w in en.tokenizer(x)]
    elif tokenize_method in tokenizer_dict:
        tokenizer = tokenizer_dict[tokenize_method]
    else:
        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer
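# Minimal usage sketch (assumes en_core_web_sm is installed for spaCy >= 3.0):
tokenize = get_tokenizer('spacy')
print(tokenize("Don't split contractions naively."))
# e.g. ['Do', "n't", 'split', 'contractions', 'naively', '.']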
def test_prefer_gpu():
    # Assumes the module-level imports used by spaCy's own test suite:
    # from spacy import prefer_gpu
    # from thinc.api import CupyOps, get_current_ops
    try:
        import cupy  # noqa: F401

        prefer_gpu()
        assert isinstance(get_current_ops(), CupyOps)
    except ImportError:
        assert not prefer_gpu()
def normalization(inputData):
    # Deduplicate while preserving order
    data = list(dict.fromkeys(inputData))
    spacy.prefer_gpu()
    nlp = spacy.load("ro_core_news_sm")  # Romanian pipeline
    result = []
    for s in data:
        result.append(nlp(s))
    return result
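# Minimal usage sketch (assumes the Romanian model ro_core_news_sm is installed);
# duplicate inputs are parsed only once:
docs = normalization(["Ana are mere.", "Ana are mere.", "Ion are pere."])
print(len(docs))  # 2 - the duplicate sentence is dropped before parsing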
def __init__(self, argv):
    super().__init__(command=__file__, argv=argv)
    spacy.prefer_gpu()
    self.nlp = spacy.load('en_core_web_sm')
    # neuralcoref only supports spaCy 2.x pipelines
    coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(coref, name='neuralcoref')
    self.__text_processor = TextProcessor(self.nlp, self._driver)
    self.create_constraints()
def load_spacy():
    print('loading spacy...')
    spacy.prefer_gpu()
    pipeline = spacy.load('en_core_web_lg')
    # spaCy v2-style: add a Sentencizer instance ahead of all other components
    sentencizer = Sentencizer()
    pipeline.add_pipe(sentencizer, first=True)
    print(pipeline.pipeline)  # list of (name, component) pairs
    return pipeline
def __init__(self, argv):
    super().__init__(command=__file__, argv=argv)
    spacy.prefer_gpu()
    self.nlp = spacy.load('en_core_web_sm')
    # coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    # self.nlp.add_pipe(coref, name='neuralcoref')
    # pytextrank <= 2.x API: register the TextRank component last
    tr = pytextrank.TextRank()
    self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
    self.__text_processor = TextProcessor(self.nlp, self._driver)
    self.create_constraints()
def loadSpacyDoc(self, text):
    spacy.prefer_gpu()
    spacyNlp = spacy.load("en_core_web_lg")
    # Create the sentencizer from the loaded pipeline rather than a separate
    # English() instance, so it shares the same vocab
    sentencizer = spacyNlp.create_pipe("sentencizer")
    spacyNlp.add_pipe(sentencizer, before="parser")
    # Create SpaCy document for sentencizing and lemmatizing
    spacyDoc = spacyNlp(text)
    return spacyDoc
def get_tokenizer():
    try:
        import spacy
        spacy.prefer_gpu()
        en = spacy.load('en_core_web_sm')  # the bare 'en' shorthand is deprecated
        print('use spacy tokenizer')
        return lambda x: [w.text for w in en.tokenizer(x)]
    except Exception:
        print('use raw tokenizer')
        return lambda x: x.split()
def get_document_embedding(text):
    """Generates the document embedding of a given report.

    INPUT: Textual data
    OUTPUT: Text embedding
    """
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_sci_md')  # scispaCy biomedical model
    doc = nlp(text)
    embedding = doc.vector
    del nlp  # free the model after a single use
    return embedding
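# Minimal usage sketch (assumes the scispaCy model en_core_sci_md is installed):
embedding = get_document_embedding("The patient presented with acute dyspnea.")
print(embedding.shape)  # e.g. (200,) - the model's word-vector width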
def __init__(self, modelSpacy='en_core_web_lg', modelCoref='en'):
    print(os.path.dirname(spacy.__file__))
    if ExtractInformation.IS_GPU:
        spacy.prefer_gpu()
    self.modelSpacy = modelSpacy
    self.modelCoref = modelCoref
    self.stanfordClient = StanfordOpenIE()
    self.nlpCoref, self.nlpSpacy = self.initSpacy(modelSpacy, modelCoref)
def load(self, path, prefer_gpu=False):
    """
    Loads a spaCy model and sets it as new self.model.

    :param path: Path to directory of spaCy model.
    :param prefer_gpu: If True, try to run the model on the GPU.
    """
    if prefer_gpu:
        spacy.prefer_gpu()
    nlp = spacy.load(path)
    self.model = nlp
def __init__(self, tweet_frequency=800):
    self.logpath = './log/io/csv/cleaner/'
    self.rpath = './data/csv/metadata.csv'
    self.logger()
    self.df = self.csv_to_dataframe()
    spacy.prefer_gpu()
    self.nlp = spacy.load('en_core_web_sm')  # the bare 'en' shorthand is deprecated
    self.tweet_freq = tweet_frequency
def tokenize(df):
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_sm')

    def helper(sent):
        tokens = [x.text for x in nlp.tokenizer(sent)]
        # Truncate long reviews to the first 300 tokens
        return tokens[:300]

    df['tokenized'] = df['reviewText'].apply(helper)
    return df
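# Minimal usage sketch (assumes pandas is imported as pd and en_core_web_sm is
# installed):
df = pd.DataFrame({'reviewText': ["Great phone, terrible battery."]})
df = tokenize(df)
print(df['tokenized'][0])  # e.g. ['Great', 'phone', ',', 'terrible', 'battery', '.']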
def __init__(self, spacy_model_name=None, cuda=-1):
    if cuda >= 0:
        spacy.prefer_gpu()
    if spacy_model_name is None:
        self.nlp = spacy.blank("en")  # create blank Language class
        self.new = True  # assumed counterpart of the else branch below
        logging.info("Created blank 'en' model")
    else:
        self.nlp = spacy.load(spacy_model_name)  # load existing spaCy model
        self.new = False
        logging.info("\nLoaded model '%s'", spacy_model_name)
def find_name(text):
    # use gpu - faster; comment out this line if GPU is not available
    spacy.prefer_gpu()
    # load English SpaCy model
    nlp = spacy.load("en_core_web_sm")
    # run the pipeline over the text
    doc = nlp(text)
    # return only entities tagged as PERSON
    return [ee for ee in doc.ents if ee.label_ == 'PERSON']
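# Minimal usage sketch (assumes en_core_web_sm is installed):
people = find_name("Ada Lovelace wrote to Charles Babbage about the engine.")
print([p.text for p in people])  # e.g. ['Ada Lovelace', 'Charles Babbage']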
def setupModel(self):
    """ Setup the NLP models for computing similarity. """
    try:
        # prefer to use GPU if available:
        spacy.prefer_gpu()
        spacymodel = self.app_config.checkAndSanitizeConfigString(
            'plugins', 'mod_dedupe_spacymodel')
        self.nlpModel = spacy.load(spacymodel)
    except Exception as e:
        logger.error("Error loading the NLP model for de-dupe: %s", e)
def convert_to_vector_representation(data):
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")
    vectors = []
    for document, star in tqdm(data):
        document_vectors = []
        for sentence in document:
            doc = nlp(sentence)
            document_vectors.append(doc.vector)
        vectors.append((torch.tensor(document_vectors), star))
    return vectors
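# Minimal usage sketch: each item is a (document, star) pair where document is a
# list of sentence strings (assumes en_core_web_lg and torch are installed):
vectors = convert_to_vector_representation([(["Great phone.", "Bad battery."], 4)])
print(vectors[0][0].shape)  # e.g. torch.Size([2, 300]) - two 300-dim sentence vectors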
def dependency_parse(filepath):
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    tokpath = os.path.join(dirpath, filepre + '.toks')
    parentpath = os.path.join(dirpath, filepre + '.parents')
    relpath = os.path.join(dirpath, filepre + '.rels')
    pospath = os.path.join(dirpath, filepre + '.pos')
    tagpath = os.path.join(dirpath, filepre + '.tag')
    lenpath = os.path.join(dirpath, filepre + '.len')
    with open(tokpath, 'w', encoding='utf-8') as tokfile, \
            open(relpath, 'w', encoding='utf-8') as relfile, \
            open(parentpath, 'w', encoding='utf-8') as parfile, \
            open(lenpath, 'w', encoding='utf-8') as lenfile, \
            open(tagpath, 'w', encoding='utf-8') as tagfile, \
            open(pospath, 'w', encoding='utf-8') as posfile:
        with open(os.path.join(dirpath, 'a.txt'), 'r', encoding='utf-8') as f:
            for line in f:
                # Collapse repeated spaces before parsing
                l = [i for i in line.split(' ') if i != '']
                newline = ' '.join(l)
                doc = nlp(newline)
                json_doc = doc.to_json()
                tokens = json_doc['tokens']
                pos, tag, dep, tok, parent = [], [], [], [], []
                # End offset of the first sentence (assumes one sentence per line)
                length = json_doc['sents'][0]['end'] + 1
                for t in tokens:
                    if t['pos'] != 'SPACE':
                        tok.append(doc[t['id']].text)
                        pos.append(t['pos'])
                        tag.append(t['tag'])
                        dep.append(t['dep'])
                        # 1-indexed head pointers; 0 marks the root
                        head = 0 if t['dep'] == 'ROOT' else t['head'] + 1
                        parent.append(head)
                tokfile.write(' '.join(tok) + '\n')
                posfile.write(' '.join(pos) + '\n')
                tagfile.write(' '.join(tag) + '\n')
                relfile.write(' '.join(dep) + '\n')
                parfile.writelines(["%s " % str(item) for item in parent])
                parfile.write('\n')
                lenfile.write(str(length) + '\n')
def __init__(self,
             lang: str = "en_core_web_sm",
             nlp: spacy.language.Language = None,
             neuralcoref: bool = False,
             device: str = None,
             *args,
             **kwargs):
    # Set all the parameters
    self.lang = lang
    self.neuralcoref = neuralcoref
    self._prebuilt = True

    # Set the device
    self._on_gpu = False
    if device and (device == "gpu" or device.startswith("cuda")):
        spacy.prefer_gpu(
            gpu_id=0 if ":" not in device else int(device.split(":")[1]))
        # Spacy sets the default torch float Tensor to torch.cuda.FloatTensor,
        # which causes other GPU cached ops to crash.
        torch.set_default_tensor_type("torch.FloatTensor")
        self._on_gpu = True

    # Load up the Spacy module
    self._nlp = nlp
    if not nlp:
        self._nlp = self._load_spacy(lang=lang)
        self._prebuilt = False

    # Add neuralcoref
    self._add_neuralcoref()

    if not nlp:
        super(Spacy, self).__init__(
            lang=lang,
            neuralcoref=neuralcoref,
            *args,
            **kwargs,
        )
    else:
        super(Spacy, self).__init__(
            lang=nlp.lang,
            # No need to pass in neuralcoref separately, it's already in the
            # pipeline if neuralcoref=True
            pipeline=nlp.pipe_names,
            *args,
            **kwargs,
        )
        print(
            "Warning: Spacy.encode does not support arbitrary nlp pipelines so "
            "information stored in the Doc object may be lost in encoding."
        )
def transform(self, X):
    spacy.prefer_gpu()
    nlp = spacy.load("en_core_web_lg")
    new = []
    for sentence in X:
        doc = nlp(sentence)
        tags = [token.tag_ for token in doc]  # fine-grained POS tags
        new.append(" ".join(tags))
    return new
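# Hypothetical usage sketch (transform is written as a scikit-learn-style method,
# so a host class instance is assumed; en_core_web_lg must be installed):
# self.transform(["The cat sat on the mat."])
# -> ['DT NN VBD IN DT NN .']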
def __init__(self, limit=10000, enable_gpu=False):
    self.limit = limit
    self.library = None
    self.texts = collections.OrderedDict()
    if enable_gpu:
        spacy.prefer_gpu()
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp.max_length = self.limit
    self.texts_pos = collections.OrderedDict()
    self.texts_stats = collections.OrderedDict()
    self.texts_dists = collections.OrderedDict()
def parse_context(text):
    import spacy

    spacy.prefer_gpu()
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    # Return the coarse-grained POS tag of every token
    return [token.pos_ for token in doc]
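# Minimal usage sketch (assumes en_core_web_sm is installed):
print(parse_context("The cat sat."))  # e.g. ['DET', 'NOUN', 'VERB', 'PUNCT']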
def __init__(self, subreddit=None, min_score=4, min_len=8, min_sent_len=6,
             max_sent_len=32, author=None, before=None, size=32, max_retries=5,
             counter_file="vocab_counter.pkl", collectionp_file="collection.pkl",
             sentence_file="sentences.txt", spacy_use=False):
    self.subreddit = subreddit
    self.min_score = min_score
    self.min_len = min_len
    self.min_sent_len = min_sent_len
    self.max_sent_len = max_sent_len
    self.author = author
    self.before = before
    self.size = size
    self.max_retries = max_retries
    # a praw.ini file is needed in the directory for this scraper to work
    self.reddit = praw.Reddit("scraper1")
    self.nlp = None  # set so the attribute exists even without spaCy
    if spacy_use:
        spacy.prefer_gpu()
        self.nlp = spacy.load("en_core_web_sm")  # the bare 'en' shorthand is deprecated
    else:
        download('punkt')  # NLTK sentence tokenizer as the fallback
    self.spacy_use = spacy_use
    self.counter = collections.Counter()
    self.commentids = []
    self.after = None
    self.collectionp_file = collectionp_file
    self.counter_file = counter_file
    self.sentence_file = sentence_file
    self.sent_count = 0
    if os.path.isfile(self.collectionp_file):
        with open(self.collectionp_file, "rb") as f:
            self.collectionp = pickle.load(f)
        self.commentids = self.collectionp["commentids"]
        self.after = self.collectionp["after"]
        self.sent_count = self.collectionp["sentence_count"]
    if os.path.isfile(self.counter_file):
        with open(self.counter_file, "rb") as f:
            self.counter = pickle.load(f)
def test_prefer_gpu():
    # On a CPU-only install, prefer_gpu() reports that no GPU was activated
    assert not prefer_gpu()