Code Example #1
    def __init__(self, dataset, cf):

        from model import CandidateFilteringModel
        from bert_serving.client import BertClient
        import jsonlines

        logger.info("Loading files...")

        #data_loader_train = dutils.load_obj_from_pkl_file('data loader (train)', cf.ASSET_FOLDER + '/data_loader_train.pkl')
        #data_loader_dev   = dutils.load_obj_from_pkl_file('data loader (dev)', cf.ASSET_FOLDER + '/data_loader_dev.pkl')

        logger.info("Building model.")
        model = CandidateFilteringModel(
            embedding_dim=cf.EMBEDDING_DIM,
            hidden_dim=cf.HIDDEN_DIM,
        )
        model.cuda()

        model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

        model.eval()

        self.modelEvaluator = ModelEvaluator(model, None, None, None, cf)
        self.cf = cf

        # Initialise the coref pipeline for use in end-user evaluation
        self.nlp = spacy.load('en')
        self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
        self.nlp.add_pipe(self.coref, name='neuralcoref')
Code Example #2
def make_nlp(coref_kwargs={}, lexicon_kwargs={}):  # pylint: disable=dangerous-default-value
    nlp = spacy.load(PREPARE_PARAMETERS['spacy_model'])

    merge_ents = nlp.create_pipe("merge_entities")
    nlp.add_pipe(merge_ents, after="ner")

    nlp.add_pipe(spacy_utils.fix_names, after='merge_entities')

    nlp.add_pipe(spacy_utils.LazyWordnetAnnotator(nlp.lang))
    nlp.add_pipe(proc_ent.EntityTypeHypernymMatcher())

    coref = neuralcoref.NeuralCoref(nlp.vocab,
                                    blacklist=False,
                                    store_scores=False,
                                    **coref_kwargs)
    nlp.add_pipe(benchmark(coref), name='neuralcoref')

    em_lex = lexicons.load_nrc_emotions()
    lextag = tagging.LexiconTagger(nlp, em_lex, **lexicon_kwargs)
    nlp.add_pipe(lextag, name='tag_emotions')

    vad_lex = lexicons.load_nrc_vad()
    lextag = tagging.LexiconTagger(nlp, vad_lex, **lexicon_kwargs)
    nlp.add_pipe(lextag, name='tag_vad')

    negtag = tagging.NegTagger(nlp.vocab)
    nlp.add_pipe(negtag)

    semdep = sem.SemanticDepParser()
    nlp.add_pipe(semdep)

    return nlp
Code Example #3
File: novel.py  Project: IDSIA/novel2graph
    def coreference(self):
        nlp = spacy.load("en_core_web_sm")
        coref = neuralcoref.NeuralCoref(nlp.vocab)
        nlp.add_pipe(coref, name='neuralcoref')
        words = self.dealiased_text.split(' ')
        words_number = len(words)
        badge_size = 100000
        if words_number > badge_size:
            if words_number % badge_size == 0:
                iterations = int(words_number / badge_size)
            else:
                iterations = int(words_number / badge_size)
                iterations += 1

            new_text = ""
            for i in range(0, iterations):
                logging.info('Coreferencing part ' + str(i + 1) + ' of ' + str(iterations))
                from_index = i * badge_size
                to_index = (i+1) * badge_size
                sub_text = ' '.join(words[from_index:to_index])

                text_coreference = nlp(sub_text)
                # text = text_coreference._.coref_resolved
                new_text += self.custom_coref_resolved(text_coreference)
        else:
            new_text = self.dealiased_text

        self.dealiased_text = new_text
Code Example #4
def initial_pipelines(greedyness=0.45,
                      liwc_path='./LIWC_code_template/LIWC2015.csv',
                      spacy_model='en_core_web_lg'):
    try:
        nlp_coref = spacy.load(spacy_model)  # , disable=['tagger', 'ner'])
    except OSError:
        raise Exception(f"spaCy model '{spacy_model}' is not downloaded")

    coref = neuralcoref.NeuralCoref(nlp_coref.vocab,
                                    cfg={'greedyness': greedyness})
    nlp_coref.add_pipe(coref, last=True, name='neuralcoref')
    # tr = pytextrank.TextRank(edge_weight=1.0, pos_kept=['ADJ', 'NOUN', 'PROPN', 'VERB'], token_lookback=3)
    # nlp_coref.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    QA_boundary = '\t\n\t\n\t'
    sep = "\n\n\n\n"

    def Q_A_seg(doc, sep="\n\n\n\n", QA_boundary='\t\n\t\n\t'):
        length = len(doc) - 1
        for index, token in enumerate(doc):
            if ((QA_boundary in token.text) or (sep in token.text)) and (
                    index < (length - 1)
            ):  # doc[index + 1] must exist and the last token cannot start a sentence, hence the extra - 1
                doc[index + 1].sent_start = True
            elif (index < (length)):
                doc[index + 1].sent_start = False
        return doc

    parser = spacy.load(spacy_model)
    parser.add_pipe(Q_A_seg, before='parser', name='Q_A_seg')

    sentencizer = spacy.load(spacy_model)
    # coref.cfg
    LIWC_table = pd.read_csv(liwc_path, index_col=0)
    return nlp_coref, parser, sentencizer, LIWC_table
Code Example #5
def perform_coref_anno(args):
    data_dir = os.path.join(
        args.data_dir,
        'tc_processed'
    )
    nlp = spacy.load('en_core_web_lg')
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    splits = [
        # 'train',
        # 'valid_freq',
        # 'valid_rare',
        'test_freq',
        # 'test_rare'
    ]

    for split in splits:
        with open(os.path.join(data_dir, split + '_anno.json'), 'r') as data_file:
            split_data = json.load(data_file)

        annotated_split = coref_anno(nlp, split_data)

        with open(os.path.join(data_dir, split + '_anno_coref_large.json'), 'w') as annotated_file:
            json.dump(annotated_split, annotated_file)
Code Example #6
    def __init__(self):
        self.nlp = nlp

        self.coref = neuralcoref.NeuralCoref(
            self.nlp.vocab,
            greedyness=0.5)  #, max_dist = 1000, max_dist_match = 1000)
        self.nlp.add_pipe(self.coref, name='neuralcoref')
        return
Code Example #7
 def __init__(self, argv):
     super().__init__(command=__file__, argv=argv)
     spacy.prefer_gpu()
     self.nlp = spacy.load('en_core_web_sm')
     coref = neuralcoref.NeuralCoref(self.nlp.vocab)
     self.nlp.add_pipe(coref, name='neuralcoref')
     self.__text_processor = TextProcessor(self.nlp, self._driver)
     self.create_constraints()
Code Example #8
    def __init__(self, data_path):
        super().__init__(data_path)
        # self.nlp = spacy.load("en_core_web_md")
        self.nlp.add_pipe(neuralcoref.NeuralCoref(self.nlp.vocab),
                          name='neuralcoref')

        self.data = self.load_data(data_path)
        self.data = self.coref_resolve(self.data)
Code Example #9
    def initSpacy(self, modelSpacy, modelCoref):
        nlpSpacy = spacy.load(modelSpacy)

        nlpCoref = spacy.load('en')
        coref = neuralcoref.NeuralCoref(nlpCoref.vocab)
        nlpCoref.add_pipe(coref, name=modelCoref)

        return nlpCoref, nlpSpacy
Code Example #10
File: server.py  Project: wjivan/GenderGapTracker
def load_spacy_lang(lang='en_core_web_sm'):
    """Return a specific spaCy language model for the NLP module"""
    logger.info(f"Loading spaCy language model: '{lang}'")
    nlp = spacy.load(lang)
    logger.info("Done...")
    # Add neuralcoref pipe
    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')
    return nlp
Code Example #11
def apply_neuralcoref(texts: Iterable[str]) -> List[str]:
    import neuralcoref
    nlp = spacy.load('en_core_web_sm', parse=False, tag=False, entity=False)
    nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref')

    start_time = time.time()
    docs = nlp.pipe(texts)
    resolved = [doc._.coref_resolved for doc in docs]
    print("\nApplied neuralcoref on {} reviews.".format(len(texts)))
    print(time.time() - start_time)
    return resolved
Code Example #12
def resolve_co_reference(text):
	'''
	The coref model scores the probability of a link between each mention and the main occurrence
	it refers to, and on that basis replaces every reference with that main occurrence
	(see the minimal sketch after this example).
	'''
	coref = neuralcoref.NeuralCoref(nlp.vocab) # initialize the neuralcoref with spacy's vocabulary
	nlp.add_pipe(coref, name='neuralcoref') #add the coref model to pipe
	doc = nlp(text)
	if doc._.has_coref: ## if coreference is possible
		return doc._.coref_resolved ##return the sentence with all references replaced
	else:
		return text ##else return text as it is 
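The docstring above describes neuralcoref's mention-linking behaviour, which the library exposes through Doc extension attributes. A minimal, self-contained sketch (assuming neuralcoref 4.x and the en_core_web_sm model; the sample sentence is made up) showing both the resolved text and the underlying clusters:

import spacy
import neuralcoref

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab), name='neuralcoref')

doc = nlp("My sister adopted a dog. She loves him.")
if doc._.has_coref:
    # Each cluster groups a main mention with the mentions that refer back to it.
    for cluster in doc._.coref_clusters:
        print(cluster.main, "<-", cluster.mentions)
    # coref_resolved rewrites the text with each mention replaced by its cluster's main mention.
    print(doc._.coref_resolved)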
Code Example #13
    def __init__(self, coreference=False):

        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/openie-model.2018-08-20.tar.gz"
        )
        if torch.cuda.is_available():
            self.predictor._model = self.predictor._model.cuda(0)

        self.spacy_pipeline = spacy.load('en')
        self.coreference = coreference
        if self.coreference:
            coref = neuralcoref.NeuralCoref(self.spacy_pipeline.vocab)
            self.spacy_pipeline.add_pipe(coref, name='neuralcoref')
Code Example #14
    def __init__(self, device):
        """
        Initiates the base model.
        :param device: The device to move the model to.
        """
        super().__init__(device)

        # Load a spacy model for tokenization.
        self.parser = spacy.load('en_core_web_sm')

        # TODO check what is the usage of this.
        coref = neuralcoref.NeuralCoref(self.parser.vocab)
        self.parser.add_pipe(coref, name='neuralcoref')
Code Example #15
File: Neuralcoref.py  Project: ph10m/ClEval
    def init_coref(self):
        # if already instantiated, remove it.
        if COREF in self.nlp.pipe_names:
            self.nlp.remove_pipe(COREF)
        coref = neuralcoref.NeuralCoref(
            self.nlp.vocab,
            blacklist=self.blacklist,
            #conv_dict = conv_dict,
            max_dist=self.max_dist,
            max_dist_match=self.max_dist_match,
            greedyness=self.greed)

        if self.verbose:
            print("Added neuralcoref to pipeline!")
        self.nlp.add_pipe(coref, name='neuralcoref')
Code Example #16
def get_spacy():
    """
    Load the coreference resolution model and return the pipeline object.
    :return: nlp pipeline object with coreference support
    """
    # Load the spaCy model
    logger.info("Loading spaCy model...")
    # spaCy load parameters
    nlp = spacy.load('en_core_web_sm')
    # Load the vocabulary and the coreference network parameters
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    # Build the pipeline, combining spaCy and the coreference network
    nlp.add_pipe(coref, name='neuralcoref')
    logger.info("Finished loading spaCy model!")

    return nlp
Code Example #17
    def __init__(self, dataset, cf):

        from model import E2EETModel
        from bert_serving.client import BertClient
        import jsonlines

        logger.info("Loading files...")

        data_loaders = dutils.load_obj_from_pkl_file(
            'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
        # Note: the word and wordpiece vocab are stored as attributes so that they may be expanded
        # if necessary during evaluation (if a new word appears)
        self.word_vocab = dutils.load_obj_from_pkl_file(
            'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
        self.wordpiece_vocab = dutils.load_obj_from_pkl_file(
            'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
        hierarchy_tr = dutils.load_obj_from_pkl_file(
            'hierarchy_tr', cf.ASSET_FOLDER + '/hierarchy_tr.pkl')
        hierarchy_et = dutils.load_obj_from_pkl_file(
            'hierarchy_et', cf.ASSET_FOLDER + '/hierarchy_et.pkl')
        total_wordpieces = dutils.load_obj_from_pkl_file(
            'total wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')

        # Initialise the coref pipeline for use in end-user evaluation
        self.nlp = spacy.load('en')
        self.coref = neuralcoref.NeuralCoref(self.nlp.vocab)
        self.nlp.add_pipe(self.coref, name='neuralcoref')

        logger.info("Building model.")
        model = E2EETModel(embedding_dim=cf.EMBEDDING_DIM +
                           cf.POSITIONAL_EMB_DIM,
                           hidden_dim=cf.HIDDEN_DIM,
                           vocab_size=len(self.wordpiece_vocab),
                           label_size_tr=len(hierarchy_tr),
                           label_size_et=len(hierarchy_et),
                           total_wordpieces=total_wordpieces,
                           max_seq_len=cf.MAX_SENT_LEN,
                           batch_size=cf.BATCH_SIZE)
        model.cuda()

        model.load_state_dict(torch.load(cf.BEST_MODEL_FILENAME))

        self.modelEvaluator = ModelEvaluator(model, None, self.word_vocab,
                                             self.wordpiece_vocab,
                                             hierarchy_tr, hierarchy_et, None,
                                             cf)
        self.cf = cf
Code Example #18
    def neural_coref_resolution(self):
        """
        Perform coreference resolution on the given text using neuralcoref.
        Supports domain-specific coreference resolution depending on the spaCy model used
        (see the sketch after this example).

        :return:
            - texts: list,
                List of sentences, resolved by coreference resolution where possible and otherwise left unresolved.
        """
        coref = neuralcoref.NeuralCoref(self.nlp.vocab)
        self.nlp.add_pipe(coref, name='neuralcoref')
        texts = self.input_data()
        for index, text in enumerate(texts):
            doc = self.nlp(text)
            texts[index] = doc._.coref_resolved
        if self.coref_output is True:
            self.coref_output_file(texts)
        return texts
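Since NeuralCoref only takes the vocabulary of whatever pipeline it is attached to, the "domain specific" behaviour mentioned in the docstring comes from the spaCy model loaded beforehand, and can be nudged further with neuralcoref's conv_dict argument. A minimal sketch under those assumptions (the model choice and the dictionary entries below are illustrative, not part of the original class):

import spacy
import neuralcoref

# Assumption: any installed English spaCy model can be substituted here.
nlp = spacy.load('en_core_web_md')

# conv_dict supplies domain-specific hints for names the resolver may not know.
coref = neuralcoref.NeuralCoref(nlp.vocab, conv_dict={'Angela': ['woman', 'CEO']})
nlp.add_pipe(coref, name='neuralcoref')

print(nlp("Angela joined the board. She now chairs it.")._.coref_resolved)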
Code Example #19
    def __init__(self, model_size, with_sentiment=False):
        assert model_size in ["sm", "lg"]
        self.nlp = spacy.load(f"en_core_web_{model_size}")  # "en_core_web_lg" for better but slower results
        coref = neuralcoref.NeuralCoref(self.nlp.vocab, greedyness=0.4)
        self.nlp.add_pipe(coref, name='neuralcoref')


        self.nlp_pp = spacy.load(f"en_core_web_{model_size}")

        self.nlp_dbp = spacy.load(f"en_core_web_{model_size}", disable=["ner"])
        initialize.load('en', self.nlp_dbp)


        self.nlp_nr = spacy.load(f"en_core_web_{model_size}")

        if with_sentiment:
            # Load target based sentiment
            self.tsa = target_based_sentiment.TargetSentimentAnalyzer()  

            # Load general based sentiment
            self.gsa = general_sentiment.GeneralSentimentAnalyzer()
Code Example #20
File: server.py  Project: wjivan/GenderGapTracker
    logger.addHandler(rotateHandler)
    logger.addHandler(stream)
    return logger


def load_spacy_lang(lang='en_core_web_sm'):
    """Return a specific spaCy language model for the NLP module"""
    logger.info(f"Loading spaCy language model: '{lang}'")
    nlp = spacy.load(lang)
    logger.info("Done...")
    # Add neuralcoref pipe
    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')
    return nlp


logger = create_app_logger('userInputDashLogger')
# Load spaCy Model
print('Loading spaCy language model...')
spacy_lang = spacy.load('en_core_web_lg')
# Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify (see the pattern sketch after this example)
ruler = EntityRuler(
    spacy_lang,
    overwrite_ents=True).from_disk('../NLP/main/rules/name_patterns.jsonl')
spacy_lang.add_pipe(ruler)
# Add neuralcoref pipe
coref = neuralcoref.NeuralCoref(spacy_lang.vocab, max_dist=200)
spacy_lang.add_pipe(coref, name='neuralcoref')
print('Finished loading.')
# Specify gender recognition service IP and port
GENDER_RECOGNITION_SERVICE = 'http://{}:{}'.format('localhost', 5000)
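The custom entity rules above are read from name_patterns.jsonl, which is not included here; an EntityRuler pattern file is simply one {"label": ..., "pattern": ...} JSON object per line. A minimal hedged sketch of building an equivalent ruler in code (the patterns below are hypothetical and not taken from GenderGapTracker):

import spacy
from spacy.pipeline import EntityRuler

spacy_lang = spacy.load('en_core_web_lg')

# Hypothetical patterns for person names that the statistical NER tends to miss.
patterns = [
    {"label": "PERSON", "pattern": [{"LOWER": "dr."}, {"IS_TITLE": True}]},
    {"label": "PERSON", "pattern": "A. P. J. Abdul Kalam"},
]

ruler = EntityRuler(spacy_lang, overwrite_ents=True)
ruler.add_patterns(patterns)
spacy_lang.add_pipe(ruler)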
Code Example #21
#         bword = b.split(" ")[-1]
    
    if getSim(a,b) >= 0.2:
        return True
    return False
 
html = urllib.request.urlopen()
soup = BeautifulSoup(html)
data = soup.find("div", {"class": className})
paras = data.findAll("p")
paras = [o.text for o in paras]

nlp = spacy.load('en_core_web_lg')

# load NeuralCoref and add it to the pipe of SpaCy's model
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

paras = [nlp(para)._.coref_resolved for para in paras]


# For SVO extraction: less accurate
# allsvos = []
# for para in paras:
#     tokens = nlp(sent)
#     svos = findSVOs(tokens)
#     allsvos.extend(svos)


testData = []
for para in paras:
Code Example #22
 def __init__(self):
     self.parser = spacy.load('en_core_web_sm')
     coref = neuralcoref.NeuralCoref(self.parser.vocab)
     self.parser.add_pipe(coref, name='neuralcoref')
Code Example #23
def redcoat_to_sents(redcoat_data):
    print(
        "Converting document-level annotations into sentence-level annotations...",
        end="")
    data = []

    nlp = spacy.load('en')
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    final_training_data = []
    non_annotated_docs = 0

    for doc_idx, obj in enumerate(redcoat_data):

        orig_tokens = obj['tokens']
        orig_mentions = obj['mentions']
        if len(orig_mentions) == 0:
            non_annotated_docs += 1
            continue

        doc = nlp(" ".join(obj['tokens']))

        spacy_tokens = [str(w) for w in doc]

        # Build a mapping between the original tokens and the Spacy tokens, which are
        # tokenized differently.
        orig_to_spacy = {}

        i = 0
        offset = 0
        for i in range(len(orig_tokens)):
            spacy_token = str(doc[i + offset])
            orig_token = orig_tokens[i]

            if not orig_token.startswith(spacy_token):
                while not orig_token.startswith(str(doc[i + offset])):
                    offset += 1
                    #print(orig_token, str(doc[i]), str(doc[i+offset]))

            orig_to_spacy[i] = i + offset
            spacy_token = str(doc[i + offset])
            #print(i, orig_token, spacy_token, offset, orig_to_spacy[i])
        orig_to_spacy[i + 1] = i + 1 + offset  # Add the last token

        # Construct a new document object using the spacy-tokenized data with updated label positions and coref.

        tagged_doc = []
        for token in spacy_tokens:
            tagged_doc.append([token, []])

        # Convert the format to a more usable data structure:
        # [word, labels]
        for mention in orig_mentions:
            start = mention['start']
            end = mention['end']
            labels = [l.split("/")[1] for l in mention['labels'] if "_" in l]

            spacy_start = orig_to_spacy[start]
            spacy_end = orig_to_spacy[end]
            for i in range(spacy_start, spacy_end):
                if not tagged_doc[i][
                        0] == ".":  # Remove labels from full stops, which are joined to words in Redcoat's tokenizer
                    tagged_doc[i][1] = labels

        # Perform neural coreference resolution, copying the labels of the main cluster to the
        # tokens that refer back to it.
        offset = 0
        if (doc._.has_coref):
            for c in doc._.coref_clusters:
                for m in c.mentions:

                    #print(m, m.start, m.end, doc[m.start:m.end], m._.coref_cluster.main)
                    orig_len = m.end - m.start
                    main_coref = [str(w) for w in m._.coref_cluster.main]

                    if len(main_coref) > 3:  # Ignore long coreference mentions
                        continue
                    #print (main_coref, m._.coref_cluster.main.start)
                    new_len = len(main_coref)

                    labels = tagged_doc[m.start + offset][1]
                    if (
                            len(labels) == 0
                            and len(tagged_doc) > m._.coref_cluster.main.start
                    ):  # Only swap the labels if this word does not already have labels
                        labels = tagged_doc[m._.coref_cluster.main.start][1]

                    main_coref_list = [[w, labels] for w in main_coref]

                    tagged_doc = tagged_doc[:m.start +
                                            offset] + main_coref_list + tagged_doc[
                                                m.end + offset:]
                    offset += new_len - orig_len
                    #print(m, main_coref_list)

        # Split up the object into sentences.
        sents = []
        current_sent = []
        for word, labels in tagged_doc:
            if word == "." and len(current_sent) > 0:
                sents.append(current_sent)
                current_sent = []
                continue
            current_sent.append([word, labels])
        if len(current_sent) > 0:
            sents.append(current_sent)

        # Fix the label indexes, i.e. head_3 in the second sentence should be
        # tag_1, and so on.
        aligned_sents = []
        for s in sents:
            label_map = {}
            labels_seen = [{"head": 0, "rel": 0, "tail": 0} for x in range(10)]
            for word, labels in s:
                for label in labels:
                    label_type, idx = label.split("_")
                    labels_seen[int(idx) - 1][label_type] += 1

            # Build a list of label indexes that are present as a head, rel, and tail in the sentence.
            complete_labels = []
            for k, v in enumerate(labels_seen):
                if v["head"] > 0 and v["rel"] > 0 and v["tail"] > 0:
                    complete_labels.append(k)

            aligned_sents.append([])
            for word, labels in s:
                new_labels = []
                for label in labels:
                    label_type, idx = label.split("_")
                    idx = int(idx) - 1
                    if idx in complete_labels:
                        new_idx = complete_labels.index(idx) + 1

                        # new_labels.append(label_type)
                        # for lx in range(1, new_idx + 1):
                        # 	new_labels.append(label_type + ''.join([str(p) + '/' for p in range(1, lx)]) + "/" + str(lx))

                        new_labels.append(label_type)
                        new_labels.append(label_type + "/" + str(new_idx))

                aligned_sents[-1].append([word, new_labels])

        # for s in sents:
        # 	for w in s:
        # 		print(w[0], w[1])
        # print("======")
        # for s in aligned_sents:
        # 	for w in s:
        # 		print(w[0], w[1])
        # exit()

        # Convert the sentences into the mention-level typing format.

        def tagged_sents_to_mentions(tagged_sents):
            mentions_data = []
            for s in tagged_sents:
                tokens = [w[0] for w in s]
                mentions = []
                current_labels = set()
                current_start = -1
                labels_seen = set()

                for i, (word, labels) in enumerate(s):

                    labels = [l for l in labels if l not in labels_seen]

                    #

                    if len(current_labels) == 0:
                        if len(labels) > 0:
                            current_labels = labels
                            current_start = i
                    elif set(labels) != set(current_labels):

                        mentions.append({
                            'start': current_start,
                            'end': i,
                            'labels': current_labels
                        })
                        for l in current_labels:
                            labels_seen.add(l)

                        current_labels = labels
                        current_start = i

                    # Handle the last token correctly
                    if i == (len(s) - 1) and len(current_labels) > 0:
                        mentions.append({
                            'start': current_start,
                            'end': i + 1,
                            'labels': current_labels
                        })
                        break

                mentions_data.append({'tokens': tokens, 'mentions': mentions})

            return mentions_data

        training_data = tagged_sents_to_mentions(aligned_sents)

        for doc in training_data:
            #print(":____")
            #print(doc)
            if len(doc['mentions']) > 0:
                final_training_data.append(doc)
            #for m in doc['mentions']:
            #	print(doc['tokens'][m['start']:m['end']], m['labels'])

        print("\rParsing annotations... %s / %s (%d not annotated)" %
              (doc_idx, len(redcoat_data), non_annotated_docs),
              end="")
    print()

    return final_training_data
Code Example #24
 def __init__(self):
     coref = neuralcoref.NeuralCoref(nlp.vocab)
     nlp.add_pipe(coref, name='neuralcoref')
     logger.info("Model loaded")
Code Example #25
from glob import glob
import multiprocessing

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

#from spacy.lang.en import English
from modules.graph_encoderABDUG4LS4V import NodePosition, Graph, EdgeType, get_edge_position

import spacy
nlp = spacy.load('en_core_web_lg')

import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab, greedyness=0.35)
nlp.add_pipe(coref, name='neuralcoref')

# nlp = English()
# sentencizer = nlp.create_pipe("sentencizer")
# nlp.add_pipe(sentencizer)

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)


class AnswerType(enum.IntEnum):
    """Type of NQ answer."""
Code Example #26

dic_temp = {"document": args, "extraction": []}

with open(args_file + ".json", 'w') as john:
    json.dump(dic_temp, john)

#**********************************************************************************************************
#                                    PHASE 2

#text = "In 2017, Amazon acquired Whole Foods Market for US$13.4 billion, which vastly increased Amazon's presence as a brickand-mortar retailer."
print()
print("Working on BUY template")
print()
nlp = spacy.load('en')
coref = neuralcoref.NeuralCoref(nlp.vocab, allow_outside_corefs=True)
nlp.add_pipe(coref, name='neuralcoref')
text = nlp(doc)

#corefsss = []
#
#
#
#    print(" ".join(i.sent_ for i in text if i.text == 'acquired'))
#
#
#displacy.serve(doc, style='dep')

#sentences = [sent.string.strip() for sent in text.sents]
#To classify the sentence according to template buy
buy = [
Code Example #27
            {'quotes': {'$exists': True}},
            {'lastModifier': 'quote_extractor'},
            {'quotesUpdated': {'$exists': False}}
        ]

    doc_id_list = args['ids'] if args['ids'] else None
    outlet_list = args['outlets'] if args['outlets'] else None

    filters = {
        'doc_id_list': doc_id_list,
        'outlets': outlet_list,
        'force_update': force_update,
        'date_filters': date_filters,
        'other_filters': other_filters
    }

    blocklist = utils.get_author_blocklist(AUTHOR_BLOCKLIST)

    print('Loading spaCy language model...')
    nlp = spacy.load('en_core_web_lg')
    # Add custom named entity rules for non-standard person names that spaCy doesn't automatically identify
    ruler = EntityRuler(nlp, overwrite_ents=True).from_disk(NAME_PATTERNS)
    nlp.add_pipe(ruler)
    print('Finished loading.')

    coref = neuralcoref.NeuralCoref(nlp.vocab, max_dist=200)
    nlp.add_pipe(coref, name='neuralcoref')

    run_pool(poolsize, chunksize)
    app_logger.info('Finished processing entities.')
    
Code Example #28
def GenerateTitle(i):
    string = ""
    isfound = False
    doc = nlp(i)
    nlp.remove_pipe("neuralcoref")
    coref = neuralcoref.NeuralCoref(nlp.vocab)
    nlp.add_pipe(coref, name='neuralcoref')

    #remove stopwords and punctuations
    words = [
        token.text for token in doc
        if token.is_stop != True and token.is_punct != True
    ]
    Nouns = [chunk.text for chunk in doc.noun_chunks]
    Adjectives = [token.lemma_ for token in doc if token.pos_ == "ADJ"]
    word_freq = Counter(words)
    word_freqNoun = Counter(Nouns)
    word_freqADJ = Counter(Adjectives)
    common_words = word_freq.most_common(5)
    common_wordsNoun = word_freqNoun.most_common(10)
    common_wordsADJ = word_freqADJ.most_common(10)
    maxcount = common_words[0][1]
    Range = min(len(common_wordsNoun), len(common_wordsADJ))
    title2 = ''
    title1 = ''
    for j in range(Range):
        title2 = common_wordsADJ[j][0] + " " + common_wordsNoun[j][0]
        if title2 in i:
            # print("Adjective + Noun Title : ",title2)
            break

    for j in common_words:
        if j[1] == maxcount:
            string += j[0] + " "
    string = string[:-1]
    while not isfound:
        if string in i:
            isfound = True
            title1 = string
            # print("Title : ",title1)
        else:
            string = ' '.join(string.split(' ')[:-1])
    title = [common_wordsNoun[0][0]]
    title = ' '.join(title)
    # print("Noun Title : ",title)

    #title - phrases
    #title1 - single word
    #title2 - adj + noun title

    if len(title) >= 5 and title != '':
        return title

    elif len(title1) >= 5 and title1 != '':
        return title1

    elif len(title2) >= 5 and title2 != '':
        return title2

    else:
        return ''
Code Example #29
# Load your usual SpaCy model (one of SpaCy English models)
import spacy
'''
NOTE: the following code crashes or gives a segmentation fault with spaCy version 2.1.4.
Downgrade to spaCy 2.1.3 by running the following on the command line:
pip install -U spacy==2.1.3
(A version-guard sketch follows this example.)
'''
import neuralcoref  # the main import required: adds neural coreference resolution on top of spaCy's pipeline
nlp = spacy.load(
    'en'
)  # load the model; change this to en_core_web_sm or another English model as appropriate
coref = neuralcoref.NeuralCoref(
    nlp.vocab)  # initialize the neuralcoref with spacy's vocabulary
nlp.add_pipe(coref, name='neuralcoref')  #add the coref model to pipe


def resolve_co_reference(text):
    '''
	The coref model scores the probability of a link between each mention and the main occurrence
	it refers to, and on that basis replaces every reference with that main occurrence.
	'''
    doc = nlp(text)
    if doc._.has_coref:  ## if coreference is possible
        return doc._.coref_resolved  ##return the sentence with all references replaced
    else:
        return text  ##else return text as it is


print(
    resolve_co_reference(
        'Donald Trump is a bad president.Mr Trump has been a formidable candidate in the elections'))
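Building on the version note at the top of this example, a small guard can fail fast before the pipeline is built rather than segfaulting later. This check is a suggestion, not part of the original snippet:

import spacy

# neuralcoref has been reported to crash with spaCy 2.1.4 (see the note above).
if spacy.__version__ == '2.1.4':
    raise RuntimeError(
        'neuralcoref is known to crash with spaCy 2.1.4; '
        'downgrade with: pip install -U spacy==2.1.3'
    )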
Code Example #30
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from nltk.corpus import stopwords
set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()
wnl = WordNetLemmatizer()

# load NeuralCoref and add it to the pipe of SpaCy's model
import spacy
nlp = spacy.load('en')
import neuralcoref
coref = neuralcoref.NeuralCoref(
    nlp.vocab,
    greedyness=0.5,
    max_dist=50,
    blacklist=False,
)
nlp.add_pipe(coref, name='neuralcoref')


def process_tag(phrase, target):
    text = nltk.word_tokenize(phrase)
    posTagged = pos_tag(text)
    simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag))
                      for word, tag in posTagged]
    res = ''
    for (word, tag) in simplifiedTags:
        if tag in target:
            res += word + ' '
    return res.strip()