Example 1
class StanfordParser(object):
    def __init__(self, nlp=None, annots=None, props=None):
        if annots is None:
            annots = "tokenize pos lemma depparse"

        if nlp is None:
            self.nlp_client = CoreNLPClient(annotators=annots,
                                            output_format='json')
        else:
            self.nlp_client = nlp

        if props is not None:
            self.nlp_client.default_properties.update(props)

        # Warm up the pipeline with a throwaway request so the first real
        # call does not pay the annotator start-up cost.
        self.nlp_client.annotate("Let's get this party started!")

    def get_parse(self, sentence):
        return self.nlp_client.annotate(sentence)

    def get_deps(self, sentence, deptype='basicDependencies', ret='asis'):
        if isinstance(sentence, str):
            sentence = self.get_parse(sentence)['sentences'][0]

        deps = sentence[deptype]

        if ret == 'asis':
            retval = deps
        else:
            retval = {}
            retval['deps'] = {x['dep']: x['dependent'] for x in deps}
            retval['heads'] = {
                x['dependentGloss']: x['governorGloss']
                for x in deps
            }
            retval['governors'] = {
                x['dependent']: x['governorGloss']
                for x in deps
            }
            retval['dependents'] = {
                x['dependent']: x['dependentGloss']
                for x in deps
            }
            retval['text'] = [
                "{}({}-{}, {}-{})".format(x['dep'], x['governorGloss'],
                                          x['governor'], x['dependentGloss'],
                                          x['dependent']) for x in deps
            ]
        return retval
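
A minimal usage sketch for the wrapper above. The example does not show its imports, so this assumes `CoreNLPClient` is importable (for instance from the `stanza.server` package) and that a local CoreNLP installation is available; the sentence and the non-'asis' `ret` value are only illustrative.

# Hypothetical usage sketch; assumes a local CoreNLP install and an
# importable CoreNLPClient (e.g. from stanza.server).
parser = StanfordParser()

# ret='asis' returns CoreNLP's raw dependency dictionaries.
raw_deps = parser.get_deps("The cat sat on the mat.", ret='asis')

# Any other ret value returns the restructured lookup dictionaries.
deps = parser.get_deps("The cat sat on the mat.", ret='dicts')
print(deps['text'])   # e.g. ['nsubj(sat-3, cat-2)', ...]
print(deps['heads'])  # dependent word -> governor word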
Example 2
class StanfordService:
    def __init__(self, parser_path: str):
        os.environ['JAVANLP_HOME'] = parser_path
        print(
            'starting CoreNLP server with JAVANLP_HOME {}'.format(parser_path))
        self.nlp = CoreNLPClient(annotators="tokenize ssplit".split(),
                                 timeout=1000000)

    def tokenize(self, text: str) -> List[Token]:
        # Retry a few times in case the CoreNLP server is still starting up.
        for _ in range(10):
            try:
                annotated_result = self.nlp.annotate(text)
                stanford_document = StanfordDocument.from_proto(
                    annotated_result)
                return StanfordService.idiomatic_tokens(stanford_document)
            except Exception as error:
                print('exception while annotating result: {}'.format(error))
                sleep(10)
        raise RuntimeError('failed to annotate text after 10 attempts')

    @staticmethod
    def idiomatic_tokens(doc: StanfordDocument):
        stanford_tokens = [
            token for sentence in doc.sentences for token in sentence.tokens
        ]
        return [
            StanfordService.idiomatic_token(token, index)
            for index, token in enumerate(stanford_tokens)
        ]

    @staticmethod
    def idiomatic_token(token: StanfordToken, token_index: int) -> Token:
        return Token(token.originalText, token_index,
                     token.characterOffsetBegin)
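
A possible way to exercise this service. The parser path below is a placeholder, and `Token`, `StanfordDocument`, and `StanfordToken` are defined elsewhere in the same project, so this is only a sketch.

# Hypothetical usage sketch; "/path/to/corenlp" is a placeholder and the
# Token / StanfordDocument types come from elsewhere in this project.
service = StanfordService(parser_path="/path/to/corenlp")
tokens = service.tokenize("Barack Obama was born in Hawaii.")
for token in tokens:
    print(token)  # one Token per CoreNLP token, with its index and character offset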
Example 3
class StanfordCoreferenceResolution:
    """
    Stanford CoreNLP co-reference.

    Parameters
    ----------
    timeout : int
        The timeout for the parser, in milliseconds.
        Defaults to 30000.
    memory : str
        The memory allocation for the CoreNLP server.
        Defaults to '6G'.

    """
    def __init__(self, timeout=30000, memory='6G'):

        self.detok = TreebankWordDetokenizer()

        self.client = CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'dcoref'],
            output_format='json',
            timeout=timeout,
            memory=memory)

    def resolve(self, doc, raise_errors=True):
        """
        Resolve the co-references for a single document.

        Parameters
        ----------
        doc : str
            A document whose co-references will be resolved.
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolve_doc : str or None
            A document whose co-references have been resolved.
            If there was a problem and `raise_errors=False`,
            then `None` will be returned.
        """
        try:
            parsed = self.client.annotate(doc)
        except Exception as error:
            if raise_errors:
                raise error
            return
        return self.replace_coreferences(parsed)

    def resolve_all(self, docs, raise_errors=True):
        """
        Resolve co-references for all the documents.

        Parameters
        ----------
        docs : list of str
            A list of documents
        raise_errors : bool, optional
            Whether to raise errors.
            Defaults to True.

        Returns
        -------
        resolved_docs : list of str
            A list of documents, with co-references resolved.
        """
        resolved_docs = []
        for doc in tqdm(docs):
            resolved_docs.append(self.resolve(doc, raise_errors))
        return resolved_docs

    @staticmethod
    def restructure_coreference_dict(corefs_dict):
        """
        Given a dictionary of co-references, restructure it into
        a new dictionary where the keys are sentence numbers
        and the values are lists of references that need to
        be resolved.

        Parameters
        ----------
        corefs_dict : dict
            A co-reference dictionary, output from Stanford.
        """
        corefs_list = [
            corefs_dict[key] for key in corefs_dict
            if len(corefs_dict[key]) > 1 and any(
                not co['isRepresentativeMention'] for co in corefs_dict[key])
        ]

        corefs_dict = defaultdict(list)
        for i, coref in enumerate(corefs_list):

            # get the first representative mention from the list;
            # if there are no representative mentions, continue
            represent = [
                co['text'] for co in coref if co['isRepresentativeMention']
            ]
            if len(represent) >= 1:
                represent = represent[0]
            else:
                continue

            # loop through the (non-representative) mentions,
            # add to the dictionary list for that sentence
            for co in coref:
                if not co['isRepresentativeMention']:
                    mention = {
                        'represent': represent,
                        'text': co['text'],
                        'startIndex': co['startIndex'],
                        'endIndex': co['endIndex'],
                        'sentNum': co['sentNum']
                    }
                    corefs_dict[co['sentNum']].append(mention)

        return corefs_dict

    def replace_coreferences(self, parsed):
        """
        We want to replace all the references with their
        representative mention.

        Parameters
        ----------
        parsed : dict
            The full output from Stanford, with co-references and sentences.
        """
        corefs = parsed['corefs']
        sents = parsed['sentences']
        corefs_dict = self.restructure_coreference_dict(corefs)

        sents = [[s['word'] for s in sent['tokens']] for sent in sents]
        sents_new = []

        # we do this on a sentence-by-sentence basis
        for sent_i, sent in enumerate(sents, start=1):

            sent_new = []
            # we check to see if the sentence is in the co-reference dictionary;
            # if it's not we won't need to do anything.
            if sent_i in corefs_dict:

                last_end = 0
                # we loop through the (sorted) references and add them
                # to our new sentence list one-by-one, being careful to
                # capture any preceding or ending text
                sorted_sent = sorted(corefs_dict[sent_i],
                                     key=lambda x: x['startIndex'])
                for co_i, co in enumerate(sorted_sent):

                    start = co['startIndex'] - 1
                    end = co['endIndex'] - 1
                    represent = co['represent']

                    # here we want to check whether this is the first co-reference;
                    # if it is, then we need to get any text *before* it
                    if co_i == 0:
                        sent_new.extend(sent[:start])
                        # capitalize the mention if it starts the sentence
                        sent_new.append(represent[0].upper() + represent[1:]
                                        if start == 0 else represent)

                    # otherwise, we just take the co-reference plus any text
                    # between it and the end of the previous co-reference
                    else:
                        sent_new.extend(sent[last_end:start])
                        sent_new.append(represent)

                    last_end = end

                sent_new.extend(sent[last_end:])

            else:
                sent_new = sent

            sents_new.append(sent_new)

        # detokenize each sentence: this re-attaches punctuation and, with
        # convert_parentheses=True, maps the -LRB-/-RRB- parenthesis tokens
        # back to '(' and ')'
        sents = ' '.join([
            self.detok.detokenize(sent, convert_parentheses=True)
            for sent in sents_new
        ])
        return sents
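
A usage sketch for the resolver, assuming a local CoreNLP installation the client can launch plus `nltk` (for the detokenizer) and `tqdm`; the sample documents and the expected output are illustrative.

# Hypothetical usage sketch; requires a local CoreNLP install, nltk, and tqdm.
resolver = StanfordCoreferenceResolution(timeout=60000, memory='8G')

doc = "John went to the store. He bought milk."
print(resolver.resolve(doc))
# roughly: "John went to the store. John bought milk."

# Batch mode; with raise_errors=False, failed documents come back as None.
resolved = resolver.resolve_all([doc, "Mary said she was tired."],
                                raise_errors=False)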
Example 4
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        # NOTE: the annotation loop below uses this Stanford client, so
        # 'Stanford' is the only tokenizer the rest of this function supports.
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())
    else:
        raise ValueError("unknown tokenizer: {}".format(args.tokenizer))
    """
    if not args.split:
        sent_tokenize = lambda para: [para]
    """
    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    with open(source_path, 'r') as f:
        source_data = json.load(f)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter = Counter()
    char_counter = Counter()
    lower_word_counter = Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            # retry until the CoreNLP server responds
            while True:
                try:
                    temp = interface.annotate(context)
                    break
                except Exception:
                    time.sleep(0.2)
            context_s = []
            for sent in temp.sentence:
                words = [word.originalText for word in sent.token]
                # Manual workaround for "\xa0", a character CoreNLP cannot
                # handle: split any token containing it into separate tokens.
                fixed = []
                for word in words:
                    fixed.extend(word.split("\xa0") if "\xa0" in word else [word])
                context_s.append(fixed)
            xi = context_s
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                # retry until the CoreNLP server responds
                while True:
                    try:
                        temp = interface.annotate(qa['question']).sentence[0]
                        break
                    except Exception:
                        time.sleep(0.2)
                qi = [t_s.originalText for t_s in temp.token]
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
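
The preprocessing function above wraps every `annotate` call in an open-ended retry loop. The helper below is not part of the original script; it is a sketch of the same pattern with a bounded number of attempts, and its name and defaults are made up for illustration.

import time

def annotate_with_retry(client, text, attempts=5, delay=0.2):
    # Illustrative helper, not part of the original script: retry
    # client.annotate(text) a bounded number of times before giving up.
    last_error = None
    for _ in range(attempts):
        try:
            return client.annotate(text)
        except Exception as error:
            last_error = error
            time.sleep(delay)
    raise RuntimeError(
        "annotation failed after {} attempts".format(attempts)) from last_error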
class Featurizer:
    def __init__(self, annotators=None, properties=None):
        with open(
                os.path.join(os.path.dirname(__file__), "assets",
                             "regexes.json")) as f:
            self.regexer = RegexFeaturizer(json.load(f))
        self.annotators = annotators or CORENLP_ANNOTATORS
        self.properties = properties or CORENLP_PROPERTIES
        self.client = CoreNLPClient(self.annotators,
                                    properties=self.properties,
                                    endpoint="http://localhost:9012")

    def __enter__(self):
        self.client.__enter__()
        return self

    def __exit__(self, *args):
        self.client.__exit__(*args)

    def _apply_features(self, obj, ann=None):
        """
        Adds features to a graph.
        """
        if "features" not in obj:
            obj["features"] = {}
        if ann:
            assert len(ann.sentence) == 1
            sentence = ann.sentence[0]
            assert len(sentence.token) == len(obj["tokens"])
            obj["features"]["lemma"] = [t.lemma for t in sentence.token]
            obj["features"]["pos"] = [t.pos for t in sentence.token]
            obj["features"]["ner"] = [t.ner for t in sentence.token]
            obj["features"]["depparse"] = _dep_to_list(
                sentence.enhancedPlusPlusDependencies)
            assert len({tail
                        for _, tail, _ in obj["features"]["depparse"]
                        }) == len(sentence.token)
            (child_to_head, head_to_child, path_length, next_in_path,
             distance_to_next_token,
             distance_from_prev_token) = compute_dependency_paths(obj)
            obj["features"]["dep_child_to_head"] = child_to_head
            obj["features"]["dep_head_to_child"] = head_to_child
            obj["features"]["dep_path_lengths"] = path_length
            obj["features"]["dep_traceback"] = next_in_path
            obj["features"]["dep_dist_to_next"] = distance_to_next_token
            obj["features"]["dep_dist_from_prev"] = distance_from_prev_token
        if self.regexer:
            obj["features"]["regexes"] = self.regexer.featurize(obj["tokens"])
            obj["features"][
                "typed_values"] = self.regexer.featurize_unit_spans(
                    obj["tokens"])

    def featurize_graph(self, obj):
        ann = self.client.annotate(" ".join(obj["tokens"]))
        self._apply_features(obj, ann)
        return obj

    def featurize_text(self, text):
        ann = self.client.annotate(text)
        assert len(ann.sentence) == 1
        sentence = ann.sentence[0]

        obj = {
            "tokens": [t.word for t in sentence.token],
        }
        self._apply_features(obj, ann)
        return obj
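
A closing usage sketch for the `Featurizer`, assuming a CoreNLP server reachable at http://localhost:9012 and the `CORENLP_ANNOTATORS` / `CORENLP_PROPERTIES` constants, `RegexFeaturizer`, and the dependency helpers defined elsewhere in the same module; the input sentence is illustrative.

# Hypothetical usage sketch; CORENLP_ANNOTATORS, CORENLP_PROPERTIES,
# RegexFeaturizer and the dependency helpers come from elsewhere in the module.
with Featurizer() as featurizer:
    obj = featurizer.featurize_text("The pump delivers 5 gallons per minute.")
    print(obj["tokens"])
    print(obj["features"]["pos"])
    print(obj["features"]["regexes"])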