def __init__(self, dataset, cf):
    from model import CandidateFilteringModel
    from bert_serving.client import BertClient
    import jsonlines

    logger.info("Loading files...")
    #data_loader_train = dutils.load_obj_from_pkl_file('data loader (train)', cf.ASSET_FOLDER + '/data_loader_train.pkl')
    #data_loader_dev = dutils.load_obj_from_pkl_file('data loader (dev)', cf.ASSET_FOLDER + '/data_loader_dev.pkl')

    logger.info("Building model.")
    model = CandidateFilteringModel(
        embedding_dim=cf.EMBEDDING_DIM,
        hidden_dim=cf.HIDDEN_DIM,
    )
    model.cuda()
    model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))
    model.eval()

    self.modelEvaluator = ModelEvaluator(model, None, None, None, cf)
    self.cf = cf

    # Initialise the coref pipeline for use in end-user evaluation
    self.nlp = spacy.load('en')
    self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(self.coref, name='neuralcoref')
def make_nlp(coref_kwargs={}, lexicon_kwargs={}):  # pylint: disable=dangerous-default-value
    nlp = spacy.load(PREPARE_PARAMETERS['spacy_model'])

    merge_ents = nlp.create_pipe("merge_entities")
    nlp.add_pipe(merge_ents, after="ner")
    nlp.add_pipe(spacy_utils.fix_names, after='merge_entities')
    nlp.add_pipe(spacy_utils.LazyWordnetAnnotator(nlp.lang))
    nlp.add_pipe(proc_ent.EntityTypeHypernymMatcher())

    coref = neuralcoref.NeuralCoref(nlp.vocab, blacklist=False, store_scores=False, **coref_kwargs)
    nlp.add_pipe(benchmark(coref), name='neuralcoref')

    em_lex = lexicons.load_nrc_emotions()
    lextag = tagging.LexiconTagger(nlp, em_lex, **lexicon_kwargs)
    nlp.add_pipe(lextag, name='tag_emotions')

    vad_lex = lexicons.load_nrc_vad()
    lextag = tagging.LexiconTagger(nlp, vad_lex, **lexicon_kwargs)
    nlp.add_pipe(lextag, name='tag_vad')

    negtag = tagging.NegTagger(nlp.vocab)
    nlp.add_pipe(negtag)

    semdep = sem.SemanticDepParser()
    nlp.add_pipe(semdep)

    return nlp
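
# A minimal usage sketch (illustrative, not part of the original source). It
# assumes PREPARE_PARAMETERS names an installed spaCy 2.1.x model and that the
# spacy_utils / lexicons / tagging / sem helpers above are importable.
if __name__ == '__main__':
    nlp = make_nlp()
    doc = nlp("Alice was angry. She slammed the door.")
    if doc._.has_coref:
        for cluster in doc._.coref_clusters:
            print(cluster.main.text, '<-', [m.text for m in cluster.mentions])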
def coreference(self):
    nlp = spacy.load("en_core_web_sm")
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    # Resolve long texts in batches of 100,000 words so a single huge document
    # does not exhaust memory (e.g. 250,000 words -> 3 batches).
    words = self.dealiased_text.split(' ')
    words_number = len(words)
    batch_size = 100000
    if words_number > batch_size:
        iterations = words_number // batch_size
        if words_number % batch_size != 0:
            iterations += 1
        new_text = ""
        for i in range(iterations):
            logging.info('Coreferencing part ' + str(i + 1) + ' of ' + str(iterations))
            from_index = i * batch_size
            to_index = (i + 1) * batch_size
            sub_text = ' '.join(words[from_index:to_index])
            text_coreference = nlp(sub_text)
            # text = text_coreference._.coref_resolved
            new_text += self.custom_coref_resolved(text_coreference)
    else:
        # Short texts are resolved in a single pass.
        new_text = self.custom_coref_resolved(nlp(self.dealiased_text))
    self.dealiased_text = new_text
def initial_pipelines(greedyness=0.45,
                      liwc_path='./LIWC_code_template/LIWC2015.csv',
                      spacy_model='en_core_web_lg'):
    try:
        nlp_coref = spacy.load(spacy_model)  # , disable=['tagger', 'ner'])
    except OSError:
        raise Exception("spaCy model '%s' is not available; download it with "
                        "`python -m spacy download %s`" % (spacy_model, spacy_model))
    coref = neuralcoref.NeuralCoref(nlp_coref.vocab, cfg={'greedyness': greedyness})
    nlp_coref.add_pipe(coref, last=True, name='neuralcoref')
    # tr = pytextrank.TextRank(edge_weight=1.0, pos_kept=['ADJ', 'NOUN', 'PROPN', 'VERB'], token_lookback=3)
    # nlp_coref.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    QA_boundary = '\t\n\t\n\t'
    sep = "\n\n\n\n"

    def Q_A_seg(doc, sep="\n\n\n\n", QA_boundary='\t\n\t\n\t'):
        length = len(doc) - 1
        for index, token in enumerate(doc):
            if ((QA_boundary in token.text) or (sep in token.text)) and (index < (length - 1)):
                # The final token can never start a sentence, so stop one position earlier.
                doc[index + 1].sent_start = True
            elif index < length:
                doc[index + 1].sent_start = False
        return doc

    parser = spacy.load(spacy_model)
    parser.add_pipe(Q_A_seg, before='parser', name='Q_A_seg')
    sentencizer = spacy.load(spacy_model)
    # coref.cfg
    LIWC_table = pd.read_csv(liwc_path, index_col=0)
    return nlp_coref, parser, sentencizer, LIWC_table
def perform_coref_anno(args):
    data_dir = os.path.join(args.data_dir, 'tc_processed')

    nlp = spacy.load('en_core_web_lg')
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    splits = [
        # 'train',
        # 'valid_freq',
        # 'valid_rare',
        'test_freq',
        # 'test_rare'
    ]

    for split in splits:
        with open(os.path.join(data_dir, split + '_anno.json'), 'r') as data_file:
            split_data = json.load(data_file)
        annotated_split = coref_anno(nlp, split_data)
        with open(os.path.join(data_dir, split + '_anno_coref_large.json'), 'w') as annotated_file:
            json.dump(annotated_split, annotated_file)
def __init__(self):
    self.nlp = nlp
    self.coref = neuralcoref.NeuralCoref(
        self.nlp.vocab, greedyness=0.5)  # , max_dist=1000, max_dist_match=1000)
    self.nlp.add_pipe(self.coref, name='neuralcoref')
def __init__(self, argv):
    super().__init__(command=__file__, argv=argv)
    spacy.prefer_gpu()
    self.nlp = spacy.load('en_core_web_sm')
    coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(coref, name='neuralcoref')
    self.__text_processor = TextProcessor(self.nlp, self._driver)
    self.create_constraints()
def __init__(self, data_path):
    super().__init__(data_path)  # the base class is assumed to set self.nlp
    # self.nlp = spacy.load("en_core_web_md")
    self.nlp.add_pipe(neuralcoref.NeuralCoref(self.nlp.vocab), name='neuralcoref')
    self.data = self.load_data(data_path)
    self.data = self.coref_resolve(self.data)
def initSpacy(self, modelSpacy, modelCoref):
    nlpSpacy = spacy.load(modelSpacy)
    nlpCoref = spacy.load('en')
    coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
    nlpCoref.add_pipe(coref, name=modelCoref)
    return nlpCoref, nlpSpacy
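
# A hypothetical call site (not from the original source), assuming the legacy
# 'en' shortcut model is linked and modelCoref is used as the pipe name:
#
#     nlpCoref, nlpSpacy = self.initSpacy('en_core_web_sm', 'neuralcoref')
#     print(nlpCoref("My sister has a dog. She loves him.")._.coref_resolved)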
def load_spacy_lang(lang='en_core_web_sm'):
    """Return a specific spaCy language model for the NLP module"""
    logger.info(f"Loading spaCy language model: '{lang}'")
    nlp = spacy.load(lang)
    logger.info("Done...")
    # Add neuralcoref pipe
    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')
    return nlp
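
# A minimal usage sketch (illustrative; neuralcoref requires spaCy 2.1.x and
# the requested model must be installed):
if __name__ == '__main__':
    nlp = load_spacy_lang('en_core_web_sm')
    doc = nlp("My sister has a dog. She loves him.")
    print(doc._.coref_resolved)  # e.g. "My sister has a dog. My sister loves a dog."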
import time
from typing import Iterable, List

import spacy


def apply_neuralcoref(texts: Iterable[str]) -> List[str]:
    import neuralcoref

    nlp = spacy.load('en_core_web_sm', parse=False, tag=False, entity=False)
    nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref')

    texts = list(texts)  # materialise the iterable so it can be sized and streamed
    start_time = time.time()
    docs = nlp.pipe(texts)
    resolved = [doc._.coref_resolved for doc in docs]
    print("\nApplied neuralcoref on {} reviews.".format(len(texts)))
    print(time.time() - start_time)
    return resolved
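
# A minimal driver sketch (illustrative, not from the original source):
if __name__ == '__main__':
    reviews = [
        "The battery is great. It lasts all day.",
        "My sister has a dog. She loves him.",
    ]
    for resolved_text in apply_neuralcoref(reviews):
        print(resolved_text)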
def resolve_co_reference(text):
    '''
    The coref model scores candidate links between each main occurrence and
    its references, and on that basis replaces every reference with the main
    occurrence it refers to.
    '''
    # Initialise neuralcoref with spaCy's vocabulary and add it to the pipeline
    # (only once: add_pipe raises an error if the component already exists).
    if 'neuralcoref' not in nlp.pipe_names:
        coref = neuralcoref.NeuralCoref(nlp.vocab)
        nlp.add_pipe(coref, name='neuralcoref')
    doc = nlp(text)
    if doc._.has_coref:  # if any coreference cluster was found
        return doc._.coref_resolved  # return the text with all references replaced
    else:
        return text  # otherwise return the text unchanged
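
# A minimal driver sketch (an assumption, not from the original snippet): the
# function relies on module-level `spacy`, `neuralcoref`, and `nlp` names.
if __name__ == '__main__':
    import spacy
    import neuralcoref

    nlp = spacy.load('en_core_web_sm')
    print(resolve_co_reference("Sara has a dog. She loves him."))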
def __init__(self, coreference=False):
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz"
    )
    if torch.cuda.is_available():
        self.predictor._model = self.predictor._model.cuda(0)
    self.spacy_pipeline = spacy.load('en')
    self.coreference = coreference
    if self.coreference:
        coref = neuralcoref.NeuralCoref(self.spacy_pipeline.vocab)
        self.spacy_pipeline.add_pipe(coref, name='neuralcoref')
def __init__(self, device):
    """
    Initiates the base model.

    :param device: The device to move the model to.
    """
    super().__init__(device)
    # Load a spaCy model for tokenization.
    self.parser = spacy.load('en_core_web_sm')
    # TODO: check how this is actually used.
    coref = neuralcoref.NeuralCoref(self.parser.vocab)
    self.parser.add_pipe(coref, name='neuralcoref')
def init_coref(self):
    # If a coref component is already in the pipeline, remove it first.
    if COREF in self.nlp.pipe_names:
        self.nlp.remove_pipe(COREF)
    coref = neuralcoref.NeuralCoref(
        self.nlp.vocab,
        blacklist=self.blacklist,
        # conv_dict=conv_dict,
        max_dist=self.max_dist,
        max_dist_match=self.max_dist_match,
        greedyness=self.greed)
    self.nlp.add_pipe(coref, name='neuralcoref')
    if self.verbose:
        print("Added neuralcoref to pipeline!")
def get_spacy():
    """
    Load the coreference-resolution model and return the model object.

    :return: the nlp coreference model object
    """
    # Load the spaCy model
    logger.info("Loading spaCy model...")
    nlp = spacy.load('en_core_web_sm')
    # Load the vocabulary and the coreference network weights
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    # Build the pipeline, combining spaCy with the coreference network
    nlp.add_pipe(coref, name='neuralcoref')
    logger.info("spaCy model loaded!")
    return nlp
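
# A minimal usage sketch (illustrative): read the clusters and the resolved
# text off the extension attributes that neuralcoref registers on the Doc.
if __name__ == '__main__':
    nlp = get_spacy()
    doc = nlp("Angela lives in Boston. She is quite happy in that city.")
    print(doc._.coref_clusters)  # e.g. [Angela: [Angela, She], Boston: [Boston, that city]]
    print(doc._.coref_resolved)  # e.g. "Angela lives in Boston. Angela is quite happy in Boston."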
def __init__(self, dataset, cf):
    from model import E2EETModel
    from bert_serving.client import BertClient
    import jsonlines

    logger.info("Loading files...")
    data_loaders = dutils.load_obj_from_pkl_file(
        'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    # Note: the word and wordpiece vocab are stored as attributes so that they may be expanded
    # if necessary during evaluation (if a new word appears)
    self.word_vocab = dutils.load_obj_from_pkl_file(
        'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    self.wordpiece_vocab = dutils.load_obj_from_pkl_file(
        'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    hierarchy_tr = dutils.load_obj_from_pkl_file(
        'hierarchy_tr', cf.ASSET_FOLDER + '/hierarchy_tr.pkl')
    hierarchy_et = dutils.load_obj_from_pkl_file(
        'hierarchy_et', cf.ASSET_FOLDER + '/hierarchy_et.pkl')
    total_wordpieces = dutils.load_obj_from_pkl_file(
        'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

    # Initialise the coref pipeline for use in end-user evaluation
    self.nlp = spacy.load('en')
    self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(self.coref, name='neuralcoref')

    logger.info("Building model.")
    model = E2EETModel(embedding_dim=cf.EMBEDDING_DIM + cf.POSITIONAL_EMB_DIM,
                       hidden_dim=cf.HIDDEN_DIM,
                       vocab_size=len(self.wordpiece_vocab),
                       label_size_tr=len(hierarchy_tr),
                       label_size_et=len(hierarchy_et),
                       total_wordpieces=total_wordpieces,
                       max_seq_len=cf.MAX_SENT_LEN,
                       batch_size=cf.BATCH_SIZE)
    model.cuda()
    model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))
    self.modelEvaluator = ModelEvaluator(model, None, self.word_vocab,
                                         self.wordpiece_vocab, hierarchy_tr,
                                         hierarchy_et, None, cf)
    self.cf = cf
def neural_coref_resolution(self):
    """
    Perform coreference resolution on the given text using neuralcoref.
    Supports domain-specific coreference resolution depending on the spaCy
    model used.

    :return:
        - texts: list, sentences resolved (or left unresolved) by the
          coreference resolution step.
    """
    coref = neuralcoref.NeuralCoref(self.nlp.vocab)
    self.nlp.add_pipe(coref, name='neuralcoref')
    texts = self.input_data()
    for index, text in enumerate(texts):
        doc = self.nlp(text)
        texts[index] = doc._.coref_resolved
    if self.coref_output is True:
        self.coref_output_file(texts)
    return texts
def __init__(self, model_size, with_sentiment=False):
    assert model_size in ["sm", "lg"]
    self.nlp = spacy.load(f"en_core_web_{model_size}")  # "en_core_web_lg" gives better but slower results
    coref = neuralcoref.NeuralCoref(self.nlp.vocab, greedyness=0.4)
    self.nlp.add_pipe(coref, name='neuralcoref')
    self.nlp_pp = spacy.load(f"en_core_web_{model_size}")
    self.nlp_dbp = spacy.load(f"en_core_web_{model_size}", disable=["ner"])
    initialize.load('en', self.nlp_dbp)
    self.nlp_nr = spacy.load(f"en_core_web_{model_size}")
    if with_sentiment:
        # Load target-based sentiment
        self.tsa = target_based_sentiment.TargetSentimentAnalyzer()
        # Load general sentiment
        self.gsa = general_sentiment.GeneralSentimentAnalyzer()
    logger.addHandler(rotateHandler)
    logger.addHandler(stream)
    return logger


def load_spacy_lang(lang='en_core_web_sm'):
    """Return a specific spaCy language model for the NLP module"""
    logger.info(f"Loading spaCy language model: '{lang}'")
    nlp = spacy.load(lang)
    logger.info("Done...")
    # Add neuralcoref pipe
    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')
    return nlp


logger = create_app_logger('userInputDashLogger')

# Load spaCy Model
print('Loading spaCy language model...')
spacy_lang = spacy.load('en_core_web_lg')
# Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify
ruler = EntityRuler(
    spacy_lang, overwrite_ents=True).from_disk('../NLP/main/rules/name_patterns.jsonl')
spacy_lang.add_pipe(ruler)
# Add neuralcoref pipe
coref = neuralcoref.NeuralCoref(spacy_lang.vocab, max_dist=200)
spacy_lang.add_pipe(coref, name='neuralcoref')
print('Finished loading.')

# Specify gender recognition service IP and port
GENDER_RECOGNITION_SERVICE = 'http://{}:{}'.format('localhost', 5000)
    # bword = b.split(" ")[-1]
    if getSim(a, b) >= 0.2:
        return True
    return False


html = urllib.request.urlopen()  # the URL argument was elided in the original
soup = BeautifulSoup(html)
data = soup.find("div", {"class": className})
paras = data.findAll("p")
paras = [o.text for o in paras]

nlp = spacy.load('en_core_web_lg')
# Load NeuralCoref and add it to the pipe of spaCy's model
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')
paras = [nlp(para)._.coref_resolved for para in paras]

# For SVO extraction: less accurate
# allsvos = []
# for para in paras:
#     tokens = nlp(sent)
#     svos = findSVOs(tokens)
#     allsvos.extend(svos)

testData = []
for para in paras:
def __init__(self):
    self.parser = spacy.load('en_core_web_sm')
    coref = neuralcoref.NeuralCoref(self.parser.vocab)
    self.parser.add_pipe(coref, name='neuralcoref')
def redcoat_to_sents(redcoat_data):
    print("Converting document-level annotations into sentence-level annotations...", end="")
    data = []
    nlp = spacy.load('en')
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    final_training_data = []
    non_annotated_docs = 0
    for doc_idx, obj in enumerate(redcoat_data):
        orig_tokens = obj['tokens']
        orig_mentions = obj['mentions']
        if len(orig_mentions) == 0:
            non_annotated_docs += 1
            continue
        doc = nlp(" ".join(obj['tokens']))
        spacy_tokens = [str(w) for w in doc]

        # Build a mapping between the original tokens and the Spacy tokens, which are
        # tokenized differently.
        orig_to_spacy = {}
        i = 0
        offset = 0
        for i in range(len(orig_tokens)):
            spacy_token = str(doc[i + offset])
            orig_token = orig_tokens[i]
            if not orig_token.startswith(spacy_token):
                while not orig_token.startswith(str(doc[i + offset])):
                    offset += 1
                    #print(orig_token, str(doc[i]), str(doc[i+offset]))
            orig_to_spacy[i] = i + offset
            spacy_token = str(doc[i + offset])
            #print(i, orig_token, spacy_token, offset, orig_to_spacy[i])
        orig_to_spacy[i + 1] = i + 1 + offset  # Add the last token

        # Construct a new document object using the spacy-tokenized data with
        # updated label positions and coref.
        # Convert the format to a more usable data structure: [word, labels]
        tagged_doc = []
        for token in spacy_tokens:
            tagged_doc.append([token, []])

        for mention in orig_mentions:
            start = mention['start']
            end = mention['end']
            labels = [l.split("/")[1] for l in mention['labels'] if "_" in l]
            spacy_start = orig_to_spacy[start]
            spacy_end = orig_to_spacy[end]
            for i in range(spacy_start, spacy_end):
                # Remove labels from full stops, which are joined to words in Redcoat's tokenizer
                if not tagged_doc[i][0] == ".":
                    tagged_doc[i][1] = labels

        # Perform neural coreference resolution, copying the labels of the main cluster to the
        # tokens that refer back to it.
        offset = 0
        if doc._.has_coref:
            for c in doc._.coref_clusters:
                for m in c.mentions:
                    #print(m, m.start, m.end, doc[m.start:m.end], m._.coref_cluster.main)
                    orig_len = m.end - m.start
                    main_coref = [str(w) for w in m._.coref_cluster.main]
                    if len(main_coref) > 3:  # Ignore long coreference mentions
                        continue
                    #print(main_coref, m._.coref_cluster.main.start)
                    new_len = len(main_coref)
                    labels = tagged_doc[m.start + offset][1]
                    if (len(labels) == 0
                            and len(tagged_doc) > m._.coref_cluster.main.start):
                        # Only swap the labels if this word does not already have labels
                        labels = tagged_doc[m._.coref_cluster.main.start][1]
                    main_coref_list = [[w, labels] for w in main_coref]
                    tagged_doc = tagged_doc[:m.start + offset] + main_coref_list \
                        + tagged_doc[m.end + offset:]
                    offset += new_len - orig_len
                    #print(m, main_coref_list)

        # Split up the object into sentences.
        sents = []
        current_sent = []
        for word, labels in tagged_doc:
            if word == "." and len(current_sent) > 0:
                sents.append(current_sent)
                current_sent = []
                continue
            current_sent.append([word, labels])
        if len(current_sent) > 0:
            sents.append(current_sent)

        # Fix the label indexes, i.e. head_3 in the second sentence should be
        # tag_1, and so on.
        aligned_sents = []
        for s in sents:
            label_map = {}
            labels_seen = [{"head": 0, "rel": 0, "tail": 0} for x in range(10)]
            for word, labels in s:
                for label in labels:
                    label_type, idx = label.split("_")
                    labels_seen[int(idx) - 1][label_type] += 1

            # Build a list of label indexes that are present as a head, rel,
            # and tail in the sentence.
            complete_labels = []
            for k, v in enumerate(labels_seen):
                if v["head"] > 0 and v["rel"] > 0 and v["tail"] > 0:
                    complete_labels.append(k)

            aligned_sents.append([])
            for word, labels in s:
                new_labels = []
                for label in labels:
                    label_type, idx = label.split("_")
                    idx = int(idx) - 1
                    if idx in complete_labels:
                        new_idx = complete_labels.index(idx) + 1
                        # new_labels.append(label_type)
                        # for lx in range(1, new_idx + 1):
                        #     new_labels.append(label_type + ''.join([str(p) + '/' for p in range(1, lx)]) + "/" + str(lx))
                        new_labels.append(label_type)
                        new_labels.append(label_type + "/" + str(new_idx))
                aligned_sents[-1].append([word, new_labels])

        # for s in sents:
        #     for w in s:
        #         print(w[0], w[1])
        # print("======")
        # for s in aligned_sents:
        #     for w in s:
        #         print(w[0], w[1])
        # exit()

        # Convert the sentences into the mention-level typing format.
        def tagged_sents_to_mentions(tagged_sents):
            mentions_data = []
            for s in tagged_sents:
                tokens = [w[0] for w in s]
                mentions = []
                current_labels = set()
                current_start = -1
                labels_seen = set()
                for i, (word, labels) in enumerate(s):
                    labels = [l for l in labels if l not in labels_seen]
                    # if len(current_labels) == 0:
                    if len(labels) > 0:
                        current_labels = labels
                        current_start = i
                    elif set(labels) != set(current_labels):
                        mentions.append({
                            'start': current_start,
                            'end': i,
                            'labels': current_labels
                        })
                        for l in current_labels:
                            labels_seen.add(l)
                        current_labels = labels
                        current_start = i
                    # Handle the last token correctly
                    if i == (len(s) - 1) and len(current_labels) > 0:
                        mentions.append({
                            'start': current_start,
                            'end': i + 1,
                            'labels': current_labels
                        })
                        break
                mentions_data.append({'tokens': tokens, 'mentions': mentions})
            return mentions_data

        training_data = tagged_sents_to_mentions(aligned_sents)
        for doc in training_data:
            #print(":____")
            #print(doc)
            if len(doc['mentions']) > 0:
                final_training_data.append(doc)
            #for m in doc['mentions']:
            #    print(doc['tokens'][m['start']:m['end']], m['labels'])

        print("\rParsing annotations... %s / %s (%d not annotated)" %
              (doc_idx, len(redcoat_data), non_annotated_docs),
              end="")
    print()
    return final_training_data
def __init__(self):
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')
    logger.info("Model loaded")
from glob import glob
import multiprocessing

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

#from spacy.lang.en import English
from modules.graph_encoderABDUG4LS4V import NodePosition, Graph, EdgeType, get_edge_position

import spacy
nlp = spacy.load('en_core_web_lg')
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab, greedyness=0.35)
nlp.add_pipe(coref, name='neuralcoref')

# nlp = English()
# sentencizer = nlp.create_pipe("sentencizer")
# nlp.add_pipe(sentencizer)

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)


class AnswerType(enum.IntEnum):
    """Type of NQ answer."""
    dic_temp = {"document": args, "extraction": []}
    with open(args_file + ".json", 'w') as john:
        json.dump(dic_temp, john)

#**********************************************************************************************************
# PHASE 2
#text = "In 2017, Amazon acquired Whole Foods Market for US$13.4 billion, which vastly increased Amazon's presence as a brickand-mortar retailer."
print()
print("Working on BUY template")
print()

nlp = spacy.load('en')
coref = neuralcoref.NeuralCoref(nlp.vocab, allow_outside_corefs=True)
nlp.add_pipe(coref, name='neuralcoref')
text = nlp(doc)

#corefsss = []
#print(" ".join(i.sent_ for i in text if i.text == 'acquired'))
#displacy.serve(doc, style='dep')
#sentences = [sent.string.strip() for sent in text.sents]

# To classify the sentence according to the BUY template
buy = [
        {'quotes': {'$exists': True}},
        {'lastModifier': 'quote_extractor'},
        {'quotesUpdated': {'$exists': False}}
    ]

    doc_id_list = args['ids'] if args['ids'] else None
    outlet_list = args['outlets'] if args['outlets'] else None

    filters = {
        'doc_id_list': doc_id_list,
        'outlets': outlet_list,
        'force_update': force_update,
        'date_filters': date_filters,
        'other_filters': other_filters
    }

    blocklist = utils.get_author_blocklist(AUTHOR_BLOCKLIST)

    print('Loading spaCy language model...')
    nlp = spacy.load('en_core_web_lg')
    # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify
    ruler = EntityRuler(nlp, overwrite_ents=True).from_disk(NAME_PATTERNS)
    nlp.add_pipe(ruler)
    print('Finished loading.')

    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')

    run_pool(poolsize, chunksize)
    app_logger.info('Finished processing entities.')
def GenerateTitle(i):
    string = ""
    isfound = False
    doc = nlp(i)
    nlp.remove_pipe("neuralcoref")
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    # Remove stopwords and punctuation
    words = [
        token.text for token in doc
        if token.is_stop != True and token.is_punct != True
    ]
    Nouns = [chunk.text for chunk in doc.noun_chunks]
    Adjectives = [token.lemma_ for token in doc if token.pos_ == "ADJ"]

    word_freq = Counter(words)
    word_freqNoun = Counter(Nouns)
    word_freqADJ = Counter(Adjectives)
    common_words = word_freq.most_common(5)
    common_wordsNoun = word_freqNoun.most_common(10)
    common_wordsADJ = word_freqADJ.most_common(10)
    maxcount = common_words[0][1]
    Range = min(len(common_wordsNoun), len(common_wordsADJ))

    title2 = ''
    title1 = ''
    for j in range(Range):
        title2 = common_wordsADJ[j][0] + " " + common_wordsNoun[j][0]
        if title2 in i:
            # print("Adjective + Noun Title : ", title2)
            break

    for j in common_words:
        if j[1] == maxcount:
            string += j[0] + " "
    string = string[:-1]
    while not isfound:
        if string in i:
            isfound = True
            title1 = string
            # print("Title : ", title1)
        else:
            string = ' '.join(string.split(' ')[:-1])

    title = [common_wordsNoun[0][0]]
    title = ' '.join(title)
    # print("Noun Title : ", title)

    # title  - phrases
    # title1 - single word
    # title2 - adj + noun title
    if len(title) >= 5 and title != '':
        return title
    elif len(title1) >= 5 and title1 != '':
        return title1
    elif len(title2) >= 5 and title2 != '':
        return title2
    else:
        return ''
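
# A hypothetical driver (not from the original source). GenerateTitle expects a
# module-level `nlp` whose pipeline already contains a 'neuralcoref' component
# (the function removes and re-adds it on every call), plus
# `from collections import Counter` at module scope.
if __name__ == '__main__':
    import spacy
    import neuralcoref
    from collections import Counter

    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref')
    print(GenerateTitle("The old lighthouse stood on the cliff. "
                        "The old lighthouse had guided sailors for a century."))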
# Load your usual spaCy model (one of the spaCy English models)
import spacy
'''
NOTE: the following code crashes or gives a segfault with spacy version 2.1.4.
Downgrade to spacy 2.1.3 by running the following on the command line:
pip install -U spacy==2.1.3
'''
import neuralcoref  # the MAIN import required: supplements spaCy's built-in coreference

nlp = spacy.load('en')  # load the model; change this to en_core_web_sm if necessary
coref = neuralcoref.NeuralCoref(nlp.vocab)  # initialize neuralcoref with spaCy's vocabulary
nlp.add_pipe(coref, name='neuralcoref')  # add the coref model to the pipeline


def resolve_co_reference(text):
    '''
    The coref model scores candidate links between each main occurrence and
    its references, and on that basis replaces every reference with the main
    occurrence it refers to.
    '''
    doc = nlp(text)
    if doc._.has_coref:  # if any coreference cluster was found
        return doc._.coref_resolved  # return the text with all references replaced
    return text  # otherwise return the text unchanged


print(
    resolve_co_reference(
        'Donald Trump is a bad president. Mr Trump has been a formidable candidate in the elections'
    ))
import nltk
from nltk import pos_tag
from nltk.tag import map_tag

nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

from nltk.corpus import stopwords
set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenizer = TreebankWordDetokenizer()
wnl = WordNetLemmatizer()

# Load NeuralCoref and add it to the pipe of spaCy's model
import spacy
nlp = spacy.load('en')
import neuralcoref
coref = neuralcoref.NeuralCoref(
    nlp.vocab,
    greedyness=0.5,
    max_dist=50,
    blacklist=False,
)
nlp.add_pipe(coref, name='neuralcoref')


def process_tag(phrase, target):
    # POS-tag the phrase, map Penn Treebank tags to the universal tagset, and
    # keep only the words whose universal tag is in `target`.
    text = nltk.word_tokenize(phrase)
    posTagged = pos_tag(text)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    res = ''
    for (word, tag) in simplifiedTags:
        if tag in target:
            res += word + ' '
    return res.strip()
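
# A minimal usage sketch (illustrative): resolve pronouns first, then keep only
# the requested universal POS tags.
if __name__ == '__main__':
    resolved = nlp("The dog chased the cat because it was hungry.")._.coref_resolved
    print(process_tag(resolved, target={'NOUN', 'VERB'}))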