Example 1
    def handle(self, *args, **options):
        spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER, disable=['parser', 'tagger', 'textcat'])
        Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
        Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
        Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
        Doc.set_extension('_lines', default=list())

        logger.debug("Loaded spacy server")
        main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
        while True:
            readable, writeable, exceptions = select(read_socks, write_socks, [])
            for sockobj in readable:
                if sockobj in main_socks:
                    new_sock, address = sockobj.accept()
                    logger.debug('Connect: %s - %s', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        entities = []
                        data = recv_end(sockobj)
                        if not data:
                            sockobj.close()
                            read_socks.remove(sockobj)
                        else:
                            for doc in spacy_model.pipe([data]):
                                doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                                for ent in doc.ents:
                                    current_entity = self.get_ent(ent)
                                    if current_entity:
                                        entities.append(current_entity)

                            sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                    except Exception:
                        # Swallow per-connection errors rather than crashing the server loop.
                        pass
Example 2
    def set_attributes(self, schema_file: Union[str, Path] = '', encoding: str = None) -> Set:
        """


        The current version SpaCy doesn't differentiate attributes for different annotation types.
        Thus, any attributes extended here will be applied to all Spans.
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param encoding: text encoding
        @return: a set of attribute names
        """
        schema_file = self.check_file_validity(schema_file, False)
        attr_names = set()
        attr_conf_start = False
        if schema_file is not None and schema_file.name.endswith("conf"):
            for row in schema_file.read_text(encoding=encoding).split("\n"):
                if len(row.strip()) == 0 or row[0] == '#':
                    continue
                if row.startswith(r'[attributes]'):
                    attr_conf_start = True
                    continue
                elif row[0] == '[':
                    attr_conf_start = False
                if attr_conf_start:
                    # [attributes]
                    # Negation        Arg:<EVENT>
                    # Confidence        Arg:<EVENT>, Value:Possible|Likely|Certain
                    name = row.split('        ')[0]
                    default_value = None
                    if name not in attr_names and not Span.has_extension(name):
                        Span.set_extension(name, default=default_value)
                        attr_names.add(name)
            self.schema_set = True
        return attr_names
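For reference, a minimal standalone sketch of the same idea: pull attribute names out of a brat-style [attributes] section and register them on Span. The two sample rows mirror the comment in the code above; the parsing here is deliberately simplified and is not the reader class itself.

from spacy.tokens import Span

conf = """\
[attributes]
# name        arguments
Negation        Arg:<EVENT>
Confidence        Arg:<EVENT>, Value:Possible|Likely|Certain
"""

for row in conf.splitlines():
    if not row.strip() or row[0] in "#[":
        continue
    name = row.split()[0]  # the attribute name is the first whitespace-delimited field
    if not Span.has_extension(name):
        Span.set_extension(name, default=None)

print(Span.has_extension("Negation"), Span.has_extension("Confidence"))  # True True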
Example 3
    def __init__(self, nlp: Language = None, support_overlap: bool = False,
                 log_level: int = logging.WARNING, encoding: str = None, doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '', store_anno_string: bool = False,
                 **kwargs):
        """

        @param nlp: Spacy Language model
        @param support_overlap: whether overlapped annotations need to be supported
        @param log_level: logging level configuration
        @param encoding: text encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use the full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to double-check the parsed Span's correctness
        @param kwargs: other parameters
        """
        self.schema_set = False
        self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
        if store_anno_string:
            if not Span.has_extension("span_txt"):
                Span.set_extension("span_txt", default="")
        super().__init__(nlp=nlp, support_overlap=support_overlap,
                         log_level=log_level, encoding=encoding, doc_name_depth=doc_name_depth,
                         schema_file=schema_file, store_anno_string=store_anno_string, **kwargs)
Example 4
from spacy.tokens import Doc, Span


def add_span_extensions():
    Doc.set_extension("relations", default=None)
    Doc.set_extension("entities", default=None)
    for span_extension in [
            'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
            'hansardParent', 'snomedct', 'synonyms'
    ]:
        Span.set_extension(span_extension, default=None)
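A small usage sketch for the extensions registered above, assuming add_span_extensions from this example is in scope and is called once per process:

import spacy
from spacy.tokens import Span

add_span_extensions()
nlp = spacy.blank("en")
doc = nlp("Apples contain fructose.")

span = Span(doc, 0, 1)
span._.entity_type = "FOOD"  # default=None extensions are writable per span
print(span._.entity_type)    # -> FOOD
print(doc._.relations)       # -> None until something sets it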
Example 5
def one_verb(i, sent, doc):
    front = ""
    if (i > 0):
        front = Span(doc, sent.start, i+sent.start)
    end = Span(doc, sent.start+i+1, sent.end-1)

    newfront = ""
    for k in front:
        if k.ent_type_ == "":
            newfront += k.lower_ + " "
        else:
            newfront += k.text + " "

    quest1 = ""
    quest2 = ""
    if sent[i].tag_ == "VBD" and sent[i].lemma_ != "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "did"+ " " + find_subject(sent) + " " + sent[i].lemma_
        quest2 = "did"+ " " + str(newfront[:-1]) + " " + sent[i].lemma_

    elif sent[i].tag_ == "VBZ" and sent[i].lemma_ != "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "does"+ " " + find_subject(sent) + " " + sent[i].lemma_
        quest2 = "does"+ " " + str(newfront[:-1]) + " " + sent[i].lemma_

    elif sent[i].tag_ == "VBP" or sent[i].tag_ == "VB":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "do"+ " " + find_subject(sent) + " " + sent[i].lemma_
        quest2 = "do"+ " " + str(newfront[:-1]) + " " + sent[i].lemma_

    elif sent[i].tag_ == "VBZ" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent)
        quest2 = sent[i].orth_ + " " + str(newfront[:-1])

    elif sent[i].tag_ == "VBD" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "was"+ " " + find_subject(sent)
        quest2 = "was"+ " " + str(newfront[:-1])

    elif sent[i].tag_ == "VBN" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent) + " " + sent[i].lemma_
        quest2 = sent[i].orth_ + " " + str(newfront[:-1]) + " " + sent[i].lemma_

    elif sent[i].tag_ == "VBN" and sent[i].lemma_ == "have":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent) + " " + sent[i].lemma_
        quest2 = sent[i].orth_ + " " + str(newfront[:-1]) + " " + sent[i].lemma_

    quest1 += " " + str(end) + "?"
    quest2 += " " + str(end) + "?"

    return quest1, quest2
Example 6
 def __init__(self, language: str = "es"):
     """
     Init method
     :param language: language of the annotation
     """
     self.__sentiment_words = load_dict(language, "sentiment_words.csv")
     self.__boosters = load_dict(language, "boosters.csv")
     self.__negations = load_dict(language, "negations.csv")
     Span.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("negation_weight", default=1.0, force=True)
     Token.set_extension("booster_weight", default=0.0, force=True)
Example 7
def test_ensure_print_span_characteristics_wont_fail():
    """Test if interface between two methods aren't destroyed if refactored"""
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    examples = [eg]
    data = _compile_gold(examples, ["spancat"], nlp, True)
    span_characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=data, spans_key=spans_key
    )
    _print_span_characteristics(span_characteristics)
Example 8
    def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '',
                 store_anno_string: bool = False,
                 use_adjudication: bool = False,
                 **kwargs):
        """

        @param nlp: a SpaCy language model
        @param support_overlap: whether the EhostDocReader needs to support reading overlapped annotations.
            Because SpaCy's Doc.ents does not allow overlapped Spans, to support overlapping, Spans need to be stored
            somewhere else: Doc._.concepts
        @param log_level: set the logger's logging level. To debug, set to logging.DEBUG
        @param encoding: text encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use the full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to double-check the parsed Span's correctness
        @param use_adjudication: whether to read annotations from the adjudication folder
        @param kwargs: other parameters
        """
        self.schema_set = False
        self.attr_names = self.set_attributes(schema_file=schema_file,
                                              encoding=encoding)
        if store_anno_string:
            if not Span.has_extension("span_txt"):
                Span.set_extension("span_txt", default="")
        super().__init__(nlp=nlp,
                         support_overlap=support_overlap,
                         log_level=log_level,
                         encoding=encoding,
                         doc_name_depth=doc_name_depth,
                         schema_file=schema_file,
                         store_anno_string=store_anno_string,
                         use_adjudication=use_adjudication,
                         **kwargs)
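As the docstring notes, doc.ents rejects overlapping Spans, which is why overlapped annotations are stored on a custom attribute such as Doc._.concepts. A minimal sketch of the difference, using spaCy v3 span groups (which, like a custom dict extension, tolerate overlap):

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Bank of China opened a branch")

overlapping = [Span(doc, 0, 3, "ORG"), Span(doc, 2, 3, "GPE")]
doc.spans["concepts"] = overlapping  # fine: span groups allow overlap
try:
    doc.ents = overlapping           # doc.ents must be non-overlapping
except ValueError as err:
    print("doc.ents rejected the overlap:", err)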
Example 9
    def __call__(self, spacy_span: Span, describer=None):
        """
        Convenient wrapper around make_issue if you are using spaCy.

        usage example:

        ```python
        from spacy.tokens import Span
        from app.factor import SpacyFactor


        SOV = SpacyFactor(
            "subject_object_verb_spacing",
            "Keep the subject, verb, and object of a sentence close together to help the reader understand the sentence."
        )

        Span.set_extension("score", default=0)
        Span.set_extension("suggestions", default=[])

        doc = nlp("Holders of the Class A and Class B-1 certificates will be entitled to receive on each Payment Date, to the extent monies are available therefor (but not more than the Class A Certificate Balance or Class B-1 Certificate Balance then outstanding), a distribution.")
        score = analyze(doc)
        if score is not None:
            span = Span(doc, 0, len(doc))  # or whichever TOKENS are the issue (don't have to worry about character indexes)
            span._.score = score
            span._.suggestions = get_suggestions(doc)
            issues = SOV(span)
        ```
        """
        text, start, end = spacy_span.text, spacy_span.start_char, spacy_span.end_char
        score = spacy_span._.score if spacy_span.has_extension("score") else 0
        suggestions = (spacy_span._.suggestions
                       if spacy_span.has_extension("suggestions") else [])
        if describer:
            description = describer(spacy_span)
        else:
            description = self.description
        return make_issue(
            text,
            start,
            end,
            issue_type=self.issue_type,
            score=score,
            description=description,
            suggestions=suggestions,
        )
Example 10
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):

        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name,
                                default=self.ANOT_NONE)
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name,
                               getter=self.is_full_name_getter)
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Example 11
from spacy.tokens import Span


def to_and_from(doc):
    new_ents = []
    for ent in doc.ents:
        # Only relabel a station when it's not the first token, so we can inspect the preceding token
        if ent.label_ == "STN" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text.lower() in ("to", "at"):
                new_ent = Span(doc, ent.start, ent.end, label="ARR")
                new_ents.append(new_ent)
            elif prev_token.text.lower() == "from":  # `in "from"` also matched substrings like "o"
                new_ent = Span(doc, ent.start, ent.end, label="DEP")
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
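In spaCy v3, a rewriting function like this is typically registered as a pipeline component and placed after the NER. A hedged sketch; the component name is illustrative and the model placeholder is hypothetical:

from spacy.language import Language
from spacy.tokens import Doc

@Language.component("to_and_from")
def to_and_from_component(doc: Doc) -> Doc:
    return to_and_from(doc)

# nlp = spacy.load("a_model_with_STN_entities")  # hypothetical model name
# nlp.add_pipe("to_and_from", after="ner")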
Example 12
def test_get_span_characteristics_return_value():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    examples = [eg]
    data = _compile_gold(examples, ["spancat"], nlp, True)
    span_characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=data, spans_key=spans_key
    )

    assert {"sd", "bd", "lengths"}.issubset(span_characteristics.keys())
    assert span_characteristics["min_length"] == 1
    assert span_characteristics["max_length"] == 3
Example 13
    def add_new_matching_span(self, document, span):
        token_end_index, token_start_index = self.calculate_span_indices(document, span)
        length = span[1] - span[0]

        if self.config.minimum_length is not None and self.config.minimum_length > length:
            return
        if self.config.maximum_length is not None and self.config.maximum_length < length:
            return

        spacy_span = Span(document, token_start_index, token_end_index, self.config.label)
        document.ents = document.ents + (spacy_span,)
Example 14
    def __init__(self, links, **kwargs):
        self.start_urls.append(links)

        import spacy
        from spacy.tokens.doc import Doc
        from spacy.tokens.span import Span

        self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
        Span.set_extension('line_number',
                           getter=TagLinkSpider.line_number_getter,
                           force=True)
        Doc.set_extension('lines',
                          getter=TagLinkSpider.get_lines,
                          setter=TagLinkSpider.set_lines)
        Doc.set_extension('_lines', default=list())

        self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        connect(self.soc_spacy, '', settings.SPACY_PORT)

        super().__init__(**kwargs)
Example 15
from spacy.tokens import Span


def time_detection(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "CARDINAL":
            next_token = doc[ent.start + 1]
            if next_token.text.lower() in ("pm", "p.m", "p.m.", "am", "a.m",
                                           "a.m."):
                new_ent = Span(doc, ent.start, ent.end + 1, label="TIME")
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
Example 16
    def set_attributes(self,
                       schema_file: Union[str, Path] = '',
                       encoding: str = None) -> Set:
        """


        The current version SpaCy doesn't differentiate attributes for different annotation types.
        Thus, any attributes extended here will be applied to all Spans.
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param encoding: text encoding
        @return: a set of attribute names
        """
        schema_file = self.check_file_validity(schema_file, False)
        attr_names = set()
        if schema_file is not None:
            root = etree.parse(str(schema_file.absolute()))
            for attr_def in root.iter("attributeDef"):
                name = attr_def[0].text.replace(' ', '_')
                default_value = attr_def[2].text
                if name not in attr_names and not Span.has_extension(name):
                    Span.set_extension(name, default=default_value)
                    attr_names.add(name)
            self.schema_set = True
        return attr_names
Example 17
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    Span.set_extension('tokens_without_citations',
                       getter=get_span_tokens_without_citations,
                       force=True)
    Span.set_extension('text_without_citations',
                       getter=get_span_text_without_citations,
                       force=True)
    Span.set_extension('text_with_ws_without_citations',
                       getter=get_span_text_with_ws_wo_cites,
                       force=True)
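The three Span extensions here are getter-based: computed on access by functions defined elsewhere in the module. A self-contained sketch of the pattern, with a stand-in getter since the real citation helpers aren't shown:

import spacy
from spacy.tokens import Span, Token

Token.set_extension("is_in_text_citation", default=False, force=True)

def tokens_without_citations_getter(span):
    # Stand-in for the real helper: drop tokens flagged as citations.
    return [t for t in span if not t._.is_in_text_citation]

Span.set_extension("tokens_without_citations",
                   getter=tokens_without_citations_getter,
                   force=True)

nlp = spacy.blank("en")
doc = nlp("See Smith 2020 for details")
doc[1]._.is_in_text_citation = True
doc[2]._.is_in_text_citation = True
print(doc[0:5]._.tokens_without_citations)  # -> [See, for, details]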
Example 18
def test_debug_data_compile_gold_for_spans():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    data = _compile_gold([eg], ["spancat"], nlp, True)

    assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
    assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
    assert data["spans_per_type"][spans_key] == {
        "ORG": [Span(ref, 3, 6, "ORG")],
        "GPE": [Span(ref, 5, 6, "GPE")],
    }
    assert data["sb_per_type"][spans_key] == {
        "ORG": {"start": [ref[2:3]], "end": [ref[6:7]]},
        "GPE": {"start": [ref[4:5]], "end": [ref[6:7]]},
    }
Example 19

def doc_findall(doc: Doc, pattern, flags=0) -> List[Span]:
    return list(doc._.finditer(pattern, flags=flags))


Doc.set_extension('findall', method=doc_findall)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Span extensions for standard regex functions 'finditer' and 'findall'
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


def span_finditer(span: Span, pattern, flags=0):
    span_start_idx = span[0].idx
    for m in re.finditer(pattern=pattern, string=span.text, flags=flags):
        start, end = m.span()
        start += span_start_idx
        end += span_start_idx
        yield span.doc._.idxs2span(start, end)


Span.set_extension('finditer', method=span_finditer)


def span_findall(span, pattern, flags=0):
    return list(span._.finditer(pattern, flags=flags))


Span.set_extension('findall', method=span_findall)
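span_finditer above delegates to Doc._.idxs2span, which is registered earlier in the same module and not shown here. As a self-contained illustration of the method= extension mechanism these helpers build on, a minimal sketch:

import re
import spacy
from spacy.tokens import Doc

def doc_regex_count(doc: Doc, pattern, flags=0) -> int:
    # Count regex matches over the raw document text.
    return sum(1 for _ in re.finditer(pattern, doc.text, flags))

Doc.set_extension("regex_count", method=doc_regex_count, force=True)

nlp = spacy.blank("en")
doc = nlp("spaCy 3.0 was released in 2021.")
print(doc._.regex_count(r"\d+"))  # -> 3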
Example 20
 def process_without_overlaps(self, doc: Doc,
                              sorted_spans: _OrderedDictItemsView,
                              classes: OrderedDict, attributes: OrderedDict,
                              relations: OrderedDict) -> Doc:
     """:arg a SpaCy Doc, can be overwriten by the subclass as needed.
         This function will add spans to doc.ents (defined by SpaCy as default)
         which doesn't allow overlapped annotations.
         @param doc: initiated SpaCy Doc
         @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
         @param classes: a OrderedDict to map a entity id to [entity label, [attr_ids]]
         @param attributes: a OrderedDict to map a attribute id to (attribute_name, attribute_value)
         @param relations: a OrderedDict to map a relation_id to (label, (relation_component_ids))
         @return: annotated Doc
     """
     existing_entities = list(doc.ents)
     new_entities = list()
     # token_left_bound = 0
     token_right_bound = len(doc) - 1
     token_start = -1
     token_end = -1
     for id, span_tuple in sorted_spans:
         # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
         # binary search is used here to speed up
         if self.store_anno_string:
             start, end, span_txt = span_tuple
         else:
             start, end = span_tuple
         if start < doc[0].idx:
              # If the annotation falls before the first SpaCy token, snap the span to the first token
             token_start = 0
             token_end = 1
         elif token_start >= token_right_bound:
              # If the annotation falls after the last SpaCy token, snap the span to the last token
             token_start = token_right_bound - 1
             token_end = token_right_bound
         else:
             token_start = self.find_start_token(start, token_start,
                                                 token_right_bound, doc)
              if end >= doc[-1].idx + len(doc[-1]):
                 token_end = token_right_bound + 1
             else:
                 token_end = self.find_end_token(end, token_start,
                                                 token_right_bound, doc)
         if token_start < 0 or token_start >= token_right_bound or token_end < 0 or token_end > token_right_bound:
              raise ValueError(
                  "It is likely your annotations overlap, which process_without_overlaps doesn't support. "
                  "You will need to initiate the EhostDocReader with 'support_overlap=True' in the arguments."
              )
         if token_start >= 0 and token_end > 0:
             span = Span(doc, token_start, token_end, label=classes[id][0])
             for attr_id in classes[id][1]:
                 if attr_id not in attributes:
                     continue
                 attr_name = attributes[attr_id][0]
                 attr_value = attributes[attr_id][1]
                 setattr(span._, attr_name, attr_value)
             if self.store_anno_string and span_txt is not None:
                 setattr(span._, "span_txt", span_txt)
             new_entities.append(span)
             token_start = token_end
         else:
             raise OverflowError(
                 'The span of the annotation: {}[{}:{}] is out of document boundary.'
                 .format(classes[id][0], start, end))
     doc.ents = existing_entities + new_entities
     return doc
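The manual char-to-token alignment above (binary search over token offsets) parallels spaCy's built-in Doc.char_span, which snaps character offsets to token boundaries; a minimal sketch:

import spacy

nlp = spacy.blank("en")
doc = nlp("Patient denies chest pain.")

# Map a character interval onto a token Span; returns None if the
# offsets don't line up with token boundaries (alignment_mode="strict").
span = doc.char_span(15, 25, label="SYMPTOM")
print(span.text, span.label_)  # -> chest pain SYMPTOM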
Example 21
    span_attn_getter,
    span_nctokens_getter,
    span_wp2ncid_getter,
    span_wp2tokid_getter,
    span_wp_getter,
    span_wp_slice_getter,
)

Doc.set_extension("wp2ncid", getter=doc_wp2ncid_getter)
Doc.set_extension("nctokens", getter=doc_nctokens_getter)
Doc.set_extension("tokid2nc", getter=doc_tokid2nc_getter)
Doc.set_extension("wp2tokid", getter=doc_wp2tokid_getter)
Doc.set_extension("tokid2ncid", getter=doc_tokid2ncid_getter)
Doc.set_extension("tokid2wp", getter=doc_tokid2wp_getter)

Span.set_extension("wp_slice", getter=span_wp_slice_getter)
Span.set_extension("wp2tokid", getter=span_wp2tokid_getter)
Span.set_extension("attention", getter=span_attn_getter)
Span.set_extension("wordpieces", getter=span_wp_getter)
Span.set_extension("wp2ncid", getter=span_wp2ncid_getter)
Span.set_extension("nctokens", getter=span_nctokens_getter)


def load_danish(spacy_model: str = "da_core_news_sm",
                transformer: str = "Maltehb/danish-bert-botxo"):
    nlp = spacy.load(spacy_model)

    if transformer:
        # add transformer
        # Construction via add_pipe with custom config
        config = {
Example 22
 def f(doc: Doc) -> Doc:
     doc.ents += (Span(doc, 3, 5, "foo"), )
     return doc
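Appending to doc.ents like this raises a ValueError if the new Span overlaps an existing entity; spacy.util.filter_spans is the usual guard. A sketch of a safer variant:

from spacy.tokens import Doc, Span
from spacy.util import filter_spans

def f_safe(doc: Doc) -> Doc:
    # filter_spans keeps the longest (then earliest) span among overlaps
    doc.ents = filter_spans(list(doc.ents) + [Span(doc, 3, 5, "foo")])
    return doc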
Example 23
    def _pseudofy_side(self,
                       rel: brat_data.Relation,
                       sentence: Span,
                       k: int,
                       do_left=True) -> SentenceGenerator:

        rel = _make_contig_rel(rel)
        if not rel:
            return
        # _make_contig_rel does make a deep copy but no interesting changes have been made yet
        logging.info(f'Original instance: {rel}')

        ent, other_ent = (rel.arg1, rel.arg2) if do_left else (rel.arg2,
                                                               rel.arg1)

        text = str(sentence)
        span_start = sentence.start_char

        start, end = adjust_spans(ent, -span_start)
        adjust_spans(other_ent, -span_start)

        try:
            original_pos = [t.pos_ for t in sentence.char_span(start, end)]
        except TypeError:
            # The char span doesn't line up with any tokens,
            # thus we can't figure out if the prediction is the right POS
            logging.info(
                'Instance rejected; the spans given do not align with tokens according to the spaCy model'
            )
            return None

        masked_sentence = text[:start] + '[MASK]' + text[end:]
        tokenized_sentence = self.bert_tokenizer.tokenize(masked_sentence)
        indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(
            tokenized_sentence)
        token_tensor = torch.tensor(indexed_tokens)
        mask_tensor = torch.tensor(
            [token != '[MASK]' for token in tokenized_sentence],
            dtype=torch.float)

        if len(token_tensor) > 512:
            # This is the token limit we report on, but the limit depends on the BERT model
            return None

        with torch.no_grad():
            result = self.bert(token_tensor.unsqueeze(0),
                               mask_tensor.unsqueeze(0),
                               masked_lm_labels=None)

        result = result[0].squeeze(0)
        scores = torch.softmax(result, dim=-1)
        mask_index = tokenized_sentence.index('[MASK]')

        topk_scores, topk_indices = torch.topk(scores[mask_index, :],
                                               k,
                                               sorted=True)
        topk_tokens = self.bert_tokenizer.convert_ids_to_tokens(topk_indices)

        for token, score in zip(topk_tokens, topk_scores):
            new_sent = text[:start] + token + text[end:]
            new_doc = self.nlp(new_sent)
            new_span = new_doc.char_span(start, start + len(token))

            if new_span is None:
                continue

            pos_match = [t.pos_ for t in new_span] == original_pos

            this_rel = deepcopy(rel)
            ent, other_ent = (this_rel.arg1,
                              this_rel.arg2) if do_left else (this_rel.arg2,
                                                              this_rel.arg1)

            ent.spans = [(start, start + len(token))]

            if ent.start < other_ent.start:
                # If the entity being changed comes before the one not being changed, the spans of the other must
                # also be adjusted; it is not guaranteed that `rel.arg1` always comes before `rel.arg2`
                new_offset = len(token) - len(ent.mention)
                adjust_spans(other_ent, new_offset)

            ent.mention = token

            new_ps = PseudoSentence(this_rel, new_sent, float(score),
                                    pos_match)
            logging.info(f'New instance: {new_ps}')
            yield new_ps
Example 24
    def process_support_overlaps(self, doc: Doc,
                                 sorted_spans: _OrderedDictItemsView,
                                 classes: OrderedDict, attributes: OrderedDict,
                                 relations: OrderedDict) -> Doc:
        """:arg a SpaCy Doc, can be overwriten by the subclass as needed.
            This function will add spans to doc._.concepts (defined in 'read' function above,
            which allows overlapped annotations.
            @param doc: initiated SpaCy Doc
            @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
            @param classes: a OrderedDict to map a entity id to [entity label, [attr_ids]]
            @param attributes: a OrderedDict to map a attribute id to (attribute_name, attribute_value)
            @param relations: a OrderedDict to map a relation_id to (label, (relation_component_ids))
            @return: annotated Doc
        """
        existing_concepts: dict = doc._.concepts
        # token_left_bound = 0
        previous_abs_end = 0
        token_right_bound = len(doc) - 1
        token_start = -1
        token_end = -1
        for id, span_tuple in sorted_spans:
            # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
            # binary search is used here to speed up
            if self.store_anno_string:
                start, end, span_txt = span_tuple
            else:
                start, end = span_tuple
            if start < doc[0].idx:
                # If the annotation falls before the first SpaCy token, snap the span to the first token
                token_start = 0
                token_end = 1
            elif token_start >= token_right_bound:
                # If the annotation falls after the last SpaCy token, snap the span to the last token
                self.logger.debug(
                    "token_start {} >= token_right_bound {}".format(
                        token_start, token_right_bound))
                token_start = token_right_bound
                token_end = token_right_bound + 1
            else:
                # if start < previous_abs_end:
                #     self.logger.debug("To find {} between token_start - 1({}[{}]) and  token_right_bound({}[{}])"
                #                       .format(start, token_start-1, doc[token_start-1].idx,
                #                               token_right_bound, doc[token_right_bound].idx), )
                #     token_start = self.find_start_token(start, token_start - 1 if token_start > 0 else 0,
                #                                         token_right_bound, doc)
                #     self.logger.debug('\tfind token_start={}[{}]'.format(token_start, doc[token_start].idx))
                #
                # else:
                self.logger.debug(
                    "To find {} between token_start ({}[{}]) and  token_right_bound({}[{}])"
                    .format(start, token_start, doc[token_start].idx,
                            token_right_bound, doc[token_right_bound].idx), )
                token_start = self.find_start_token(start, token_start,
                                                    token_right_bound, doc)
                self.logger.debug("\tfind start token {}('{}')".format(
                    token_start, doc[token_start]))
                if end >= doc[-1].idx + doc[-1].__len__():
                    self.logger.debug(
                        "end  ({}) >= doc[-1].idx ({}) + doc[-1].__len__() ({})"
                        .format(end, doc[-1].idx, doc[-1].__len__()))
                    token_end = token_right_bound + 1
                else:
                    self.logger.debug(
                        "To find token_end starts from {} between token_start ({}[{}]) and  token_right_bound({}[{}])"
                        .format(end, token_start, doc[token_start].idx,
                                token_right_bound, doc[token_right_bound].idx))
                    token_end = self.find_end_token(end, token_start,
                                                    token_right_bound, doc)
                    self.logger.debug("\tFind end token {}('{}')".format(
                        token_end, doc[token_end]))
            if token_start >= 0 and token_end > 0:
                span = Span(doc, token_start, token_end, label=classes[id][0])
                if self.logger.isEnabledFor(logging.DEBUG):
                    import re
                    if re.sub(r'\s+', ' ', span._.span_txt) != re.sub(
                            r'\s+', ' ', str(span)):
                        self.logger.debug('{}[{}:{}]\n\t{}<>\n\t{}<>'.format(
                            classes[id][0], token_start, token_end,
                            re.sub(r'\s+', ' ', span._.span_txt),
                            re.sub(r'\s+', ' ', str(span))))
                for attr_id in classes[id][1]:
                    if attr_id not in attributes:
                        continue
                    attr_name = attributes[attr_id][0]
                    attr_value = attributes[attr_id][1]
                    setattr(span._, attr_name, attr_value)
                if self.store_anno_string and span_txt is not None:
                    setattr(span._, "span_txt", span_txt)
                if classes[id][0] not in existing_concepts:
                    existing_concepts[classes[id][0]] = list()
                existing_concepts[classes[id][0]].append(span)
                # token_start = token_end
                previous_abs_end = token_start

            else:
                raise OverflowError(
                    'The span of the annotation: {}[{}:{}] is out of document boundary.'
                    .format(classes[id][0], start, end))
        return doc
#print("*) PRINT Spacy NER Formatted string - 02")
#for i in range(0,len(ner_ita_spacy_doc.ents)):
#    ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01_ner_formatted.replace(ner_ita_spacy_doc.ents[i].text, ner_ita_spacy_doc.ents[i].text+"("+ner_ita_spacy_doc.ents[i].label_+")")

print("*) Spacy ENTITY CORRECTION - ITA")
'''
Obtaining the hash of labels with spaCy

'''
money_tag = ner_ita_spacy_doc.vocab.strings["MONEY"]
print(money_tag)
print("*) Original phrase")
print(ner_ita_phrase_01)
print(ner_ita_spacy_doc.ents)
## IMPORTANT: spaCy lets you define an ad-hoc NER entity - MONEY
it_money_ner = Span(ner_ita_spacy_doc, 14, 18, label=money_tag)
print(it_money_ner)

ner_ita_spacy_doc.ents = list(ner_ita_spacy_doc.ents) + [it_money_ner]
print(ner_ita_spacy_doc.ents)
ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01
for i in range(0, len(ner_ita_spacy_doc.ents)):
    ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01_ner_formatted.replace(
        ner_ita_spacy_doc.ents[i].text, ner_ita_spacy_doc.ents[i].text + "(" +
        ner_ita_spacy_doc.ents[i].label_ + ")")

print("*) PRINT Spacy NER Formatted string - 01 - + new ENTITY - MONEY")
print(ner_ita_phrase_01_ner_formatted)
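The label here is the hash looked up in the model's string store; labels can equally be passed as plain strings. The hash/string round-trip looks like this in a minimal sketch with a blank pipeline, where the label must first be added to the store:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("it")
doc = nlp("Il conto è di 100 euro in totale")
money_hash = doc.vocab.strings.add("MONEY")  # returns the 64-bit hash
span = Span(doc, 3, 6, label=money_hash)
print(span.text, span.label_)                # -> di 100 euro MONEY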
Example 26
import ahocorasick
import spacy
import textspan
from spacy.tokens import Doc
from spacy.tokens.span import Span
from spacy.util import filter_spans
from typing_extensions import Literal

from camphr.utils import SerializationMixin, get_doc_char_span

# Sometimes matched text is different from original text
# since `PatternSearcher` can match the `lemma`.
# This extension holds the matched text.
PATTERN_MATCH_AS = "pattern_match_as"
Span.set_extension(PATTERN_MATCH_AS, default=None, force=True)


@spacy.component("pattern_searcher")
class PatternSearcher(SerializationMixin):
    serialization_fields = [
        "model",
        "label_type",
        "custom_label",
        "custom_label_map",
        "destructive",
        "lemma",
        "lower",
        "cfg",
        "normalizer",
    ]
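Any span the searcher produces can then carry the matched surface form in span._.pattern_match_as; a minimal sketch reading and writing the extension by hand, since the searcher's __call__ isn't shown:

import spacy
from spacy.tokens import Span

Span.set_extension("pattern_match_as", default=None, force=True)

nlp = spacy.blank("en")
doc = nlp("running shoes")
span = Span(doc, 0, 1)
span._.pattern_match_as = "run"  # e.g., the pattern matched on the lemma
print(span.text, "->", span._.pattern_match_as)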
Example 27
    def __call__(self, doc):
        # TODO: could use refactoring for readability and structure; it's too complex now
        full_name_spans = []
        min_span_size = 2
        max_span_size = 2

        is_first_char_capped_list = [
            doc[i].text[0].isupper() for i in range(len(doc))
        ]

        i = 0
        while i < len(doc) - 1:
            is_first_name_this = doc[i]._.get(self.first_name_extension_name)
            is_last_name_next = doc[i + 1]._.get(self.last_name_extension_name)
            both_capped = is_first_char_capped_list[
                i] and is_first_char_capped_list[i + 1]
            if is_first_name_this and is_last_name_next and both_capped:
                doc[i]._.set(self.token_extension_name, self.ANOT_INIT)
                doc[i + 1]._.set(self.token_extension_name, self.ANOT_OTHER)
                span_start = i
                span_end = i + 1

                #look-back for more first names
                first_first_name = i
                while first_first_name-1>= 0 and doc[first_first_name-1]._.get(self.first_name_extension_name)\
                        and is_first_char_capped_list[first_first_name-1]:
                    first_first_name -= 1
                span_start = first_first_name
                if first_first_name != i:  #name starts earlier
                    doc[i]._.set(self.token_extension_name, self.ANOT_OTHER)
                    doc[first_first_name]._.set(self.token_extension_name,
                                                self.ANOT_INIT)
                    for j in range(first_first_name + 1, i):
                        doc[j]._.set(self.token_extension_name,
                                     self.ANOT_OTHER)
                # Check if considering the last name as a first name would still yield a valid match
                #   when adding the following word
                look_ahead_counter = 1
                while (i+1+look_ahead_counter) < len(doc) \
                        and doc[i+look_ahead_counter]._.get(self.first_name_extension_name) \
                        and doc[i+1+look_ahead_counter]._.get(self.last_name_extension_name)\
                        and is_first_char_capped_list[i+1+look_ahead_counter]:
                    doc[i + 1 + look_ahead_counter]._.set(
                        self.token_extension_name, self.ANOT_OTHER)
                    look_ahead_counter += 1
                    span_end += 1
                #Check if it can be extended with more last names
                while (span_end + 1 < len(doc) and doc[span_end + 1]._.get(
                        self.last_name_extension_name)
                       and is_first_char_capped_list[span_end + 1]):
                    doc[span_end + 1]._.set(self.token_extension_name,
                                            self.ANOT_OTHER)
                    span_end += 1
                # Build the span from the final bounds (a span created before the
                # look-back/look-ahead extensions would miss the extra tokens)
                full_name_spans.append(
                    Span(doc, span_start, span_end + 1, label=self.SPAN_LABEL))
                i = span_end + 1
            else:
                i += 1

        doc._.set(self.doc_extension_name, full_name_spans)

        return doc
Example 28
    def __init__(self, txt_path):
        super(ASK, self).__init__()

        f_path = txt_path 


        with open(f_path, 'r') as f:
            txt = f.readlines()

        txt = [x.strip() for x in txt]

        sentence_regex = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
        regex = re.compile(sentence_regex)

        txt = [removeParentheses(x) for x in txt if regex.search(x)]
        txt = ''.join(str(elem) for elem in txt)
        self.txt = txt
        nlp = spacy.load('en')
        doc = nlp(txt)

        questions = []

        words = []
        for word in doc:
            if word.pos_ == "NOUN" or word.pos_ == "VERB":
                words.append(word.lemma_)

        count = Counter(words)

        for sent in doc.sents:
            rel = 0
            for word in sent:
                rel += count.get(word.lemma_, 0)

            rel = rel/float(len(sent))

            if sent[0].text == "This":
                remain = Span(doc, sent.start+1, sent.end-1)
                questions.append(("What " + remain.text + "?", rel+10))

            # What questions
            if (sent.root.lemma_ == "be"):
                for r in sent.root.rights:
                    join = ""
                    first = True
                    for w in r.subtree:
                        if first and w.ent_type_ == "":
                            join += ' ' + w.lower_
                        else:
                            first = False
                            join += ' ' + w.text
                    if join[-1] == ",":
                        join = join[:-2]
                    what_question_1 = "What " + sent.root.text + " " + join + "?"
                    questions.append((what_question_1, rel))
                    break
                for r in sent.root.lefts:
                    join = ""
                    first = True
                    for w in r.subtree:
                        if first and w.ent_type_ == "":
                            join += ' ' + w.lower_
                        else:
                            first = False
                            join += ' ' + w.text
                    if join[-1] == ",":
                        join = join[:-2]
                    what_question_2 = "What " + sent.root.text + " " + join + "?"
                    questions.append((what_question_2, rel))

            # Who questions
            subject = ["he", "she"]
            for i in range(0, len(sent)-1) :
                if sent[i].dep_ == "nsubj" and sent[i].ent_type_ == "PERSON" or sent[i].text in subject:
                    if i > 0:
                        start = Span(doc, sent.start, i+sent.start-1)
                    else:
                        start = Span(doc, sent.start, sent.start)
                    i = i+1
                    while i < len(sent)-1:
                        if sent[i].ent_type_ == "PERSON":
                            i = i+1
                        elif sent[i].dep_ == "nsubj":
                            i = i+1
                        else:
                            break
                    end = Span(doc, i + sent.start, sent.end-1)
                    who_question = "Who " + end.text + "?"
                    questions.append((who_question, rel))
                    break

            # When questions
            # Only works when original sentence is "In ____, blah blah."
            for i in range(0, len(sent)-1):
                if (sent[i].ent_type_ == "DATE" and sent[i].pos_ != "ADJ"):
                    hi = Span(doc, sent.start, i+sent.start)
                    head = sent[i].head
                    while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                        i = i+1
                    end = Span(doc, i + sent.start, sent.end-1)
                    verb = sent[i]
                    for t in sent.root.lefts:
                        verb = t
                    if verb.lemma_ == "be":
                        final = "When was "
                    else:
                        final = "When did "
                    if len(end) > 0 and (end[0].pos_ == "NOUN" or end[0].pos_ == "DET"):
                        for token in end:
                            if verb.lemma_ == "be" and token.lemma_ == "be":
                                final = final
                            elif verb.lemma_ != "be" and token == sent.root:
                                final = final + sent.root.lemma_ + " "
                            else:
                                final = final + token.orth_ + " "
                        when_question_1 = final[:-1] + "?"
                        questions.append((when_question_1, rel))
                    break

            for i in range(0, len(sent)-1):
                if (sent[i].ent_type_ == "DATE"):
                    if (i > 0):
                        front = Span(doc, sent.start, i+sent.start-1)
                    else:
                        front = []
                    valid = False
                    while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                        i = i+1
                        valid = True
                    if valid:
                        end = Span(doc, sent.start + i, sent.end-1)
                        verb = sent[i]
                        for t in sent.root.lefts:
                            verb = t
                        if verb.lemma_ == "be":
                            final = "When was "
                        else:
                            final = "When did "
                        if len(end) > 0 and (end[0].pos_ == "NOUN" or end[0].pos_ == "DET"):
                            for token in front:
                                if verb.lemma_ == "be" and token.lemma_ == "be":
                                    final = final
                                elif verb.lemma_ != "be" and token == sent.root:
                                    final = final + sent.root.lemma_ + " "
                                elif token == front[0] and token.ent_type_ == "":
                                    final = final + token.lower_ + " "
                                else:
                                    final = final + token.orth_ + " "
                            for token in end:
                                if verb.lemma_ == "be" and token.lemma_ == "be":
                                    final = final
                                elif verb.lemma_ != "be" and token == sent.root:
                                    final = final + sent.root.lemma_ + " "
                                else:
                                    final = final + token.orth_ + " "
                            when_question_2 = final[:-1] + "?"
                            questions.append((when_question_2, rel))
                    break

            # Where questions
            wheretag = ["GPE", "LOC", "FACILTY", "ORG"]
            for i in range(0, len(sent)-1):
                if (sent[i].ent_type_ in wheretag) and (sent[i-1].ent_type_ not in wheretag) and (sent[i-1].tag_ == "IN"):

                    oneloc =  " " + str(sent[i-1])+ " " + str(sent[i])
                    j = i
                    while j < len(sent)-1 and sent[j+1].ent_type_ in wheretag:
                        oneloc += " " + str(sent[j+1])
                        j = j + 1
                    i = j

                    where_question1 = ""
                    where_question2 = ""

                    for k in range(0, len(sent)-1):
                        if sent[k] == sent.root and (sent[k-1].pos_ == 'VERB'):
                            where_question1 = "Where" + " " + two_verbs(k, sent, doc)[0]
                            where_question2 = "Where" + " " + two_verbs(k, sent, doc)[1]

                        elif sent[k] == sent.root:
                            where_question1 = "Where" + " " + one_verb(k, sent, doc)[0]
                            where_question2 = "Where" + " " + one_verb(k, sent, doc)[1]

                    where_question1 = where_question1.replace(oneloc, "")
                    where_question2 = where_question2.replace(oneloc, "")
                    questions.append((where_question1, rel))
                    questions.append((where_question2, rel))
                    break

        # DO/ DID DOES HAVE WAS IS questions
            for i in range(0, len(sent)-1):
                vquest1 = ""
                vquest2 = ""

                if sent[i] == sent.root and (sent[i-1].pos_ == 'VERB'):
                    vquest1 = (two_verbs(i, sent, doc)[0])
                    vquest1 = vquest1[0].capitalize() + vquest1[1:]
                    vquest2 = (two_verbs(i, sent, doc)[1])
                    vquest2 = vquest2[0].capitalize() + vquest2[1:]

                    questions.append((vquest1, rel))
                    questions.append((vquest2, rel))
                    break

                elif sent[i] == sent.root:
                    vquest1 = (one_verb(i, sent, doc)[0])
                    vquest1 = vquest1[0].capitalize() + vquest1[1:]
                    vquest2 = (one_verb(i, sent, doc)[1])
                    vquest2 = vquest2[0].capitalize() + vquest2[1:]

                    questions.append((vquest1, rel))
                    questions.append((vquest2, rel))
                    break

        pronouns = ["he", "she", "his", "her"]
        questions = sorted(questions, key=lambda x: x[1], reverse=True)
        goodQuestions = []
        goodPronouns = []
        for q in questions:
            if (q[0].count(' ') > 3):
                sentence = q[0]
                try:
                    matches = tool.check(sentence)
                    if len(matches)==0:
                        words = q[0].split()
                        if not any(p == w for w in words for p in pronouns):
                            goodQuestions.append(sentence)
                        else:
                            goodPronouns.append(sentence)
                except Exception:
                    words = q[0].split()
                    if not any(p == w for w in words for p in pronouns):
                        goodQuestions.append(sentence)
                    else:
                        goodPronouns.append(sentence)

        count = 0
        for i in range(0, 4):
            for j in range(0, len(goodQuestions) // 4):
                if count < qNum:
                    count = count + 1
                    print(goodQuestions[j * 4 + i])

        for i in range(0, 4):
            for j in range(0, len(goodPronouns) // 4):
                if count < qNum:
                    count = count + 1
                    print(goodPronouns[j * 4 + i])
Example 29
    def __init__(self, txt_path):
        super(NER, self).__init__()

        cur_path = os.path.dirname(__file__)
        rel_path = '../data/' + txt_path
        f_path = os.path.join(cur_path, rel_path)


        with open(f_path, 'r') as f:
            txt = f.readlines()

        txt = [x.strip() for x in txt]

        sentence_regex = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
        regex = re.compile(sentence_regex)

        txt = [x for x in txt if regex.search(x)]
        txt = ''.join(str(elem) for elem in txt)
        self.txt = txt
        nlp = spacy.load('en')
        doc = nlp(txt)

        questions = []
        when = []

        words = []
        for word in doc:
            if word.pos_ == "NOUN" or word.pos_ == "VERB":
                words.append(word.lemma_)

        count = Counter(words)

        for sent in doc.sents:
            rel = 0
            for word in sent:
                rel += count.get(word.lemma_, 0)

            rel = rel/float(len(sent))

            # What questions
            if (sent.root.lemma_ == "be"):
                for r in sent.root.rights:
                    join = ' '.join(w.text for w in r.subtree)
                    if join[-1] == ",":
                        join = join[:-2]
                    what_question_1 = "What " + sent.root.text + " " + join + "?"
                    questions.append((format_question(what_question_1), rel+gscore(what_question_1)))
                    break
                for r in sent.root.lefts:
                    join = ' '.join(w.text for w in r.subtree)
                    if join[-1] == ",":
                        join = join[:-2]
                    what_question_2 = "What " + sent.root.text + " " + join + "?"
                    questions.append((format_question(what_question_2), rel+gscore(what_question_2)))

            # Who questions
            subject = ["he", "she"]
            for i in range(0, len(sent)-1) :
                if sent[i].dep_ == "nsubj" and sent[i].ent_type_ == "PERSON" or sent[i].text in subject:
                    if i > 0:
                        start = Span(doc, sent.start, i+sent.start-1)
                    else:
                        start = Span(doc, sent.start, sent.start)
                    i = i+1
                    while i < len(sent)-1:
                        if sent[i].ent_type_ == "PERSON":
                            i = i+1
                        elif sent[i].dep_ == "nsubj":
                            i = i+1
                        else:
                            break
                    end = Span(doc, i + sent.start, sent.end-1)
                    if (len(start) == 0):
                        # print "Who " + end.text + "?"
                        who_question = "Who " + end.text + "?"
                        questions.append((format_question(who_question), rel+gscore(who_question)))
                    else:
                        # print start.text + " who " + end.text + "?"
                        who_question = start.text + " who " + end.text + "?"
                        questions.append((format_question(who_question), rel+gscore(who_question)))
                    break

            # When questions
            # Only works when original sentence is "In ____, blah blah."
            for i in range(0, len(sent)-1):
                if (sent[i].ent_type_ == "DATE" and sent[i].pos_ != "ADJ"):
                    hi = Span(doc, sent.start, i+sent.start)
                    head = sent[i].head
                    while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                        i = i+1
                    end = Span(doc, i + sent.start, sent.end-1)
                    verb = sent[i]
                    for t in sent.root.lefts:
                        verb = t
                    if verb.lemma_ == "be":
                        final = "When was "
                    else:
                        final = "When did "
                    for token in end:
                        if verb.lemma_ == "be" and token.lemma_ == "be":
                            final = final
                        elif verb.lemma_ != "be" and token == sent.root:
                            final = final + sent.root.lemma_ + " "
                        else:
                            final = final + token.orth_ + " "
                    # print final[:-1] + "?"
                    when_question_1 = final[:-1] + "?"
                    questions.append((when_question_1, rel+gscore(when_question_1)))
                    when.append((when_question_1, rel+gscore(when_question_1)))
                    break

            for i in range(0, len(sent)-1):
                if (sent[i].ent_type_ == "DATE"):
                    if (i > 0):
                        front = Span(doc, sent.start, i+sent.start-1)
                    else:
                        front = []
                    valid = False
                    while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                        i = i+1
                        valid = True
                    if valid:
                        end = Span(doc, sent.start + i, sent.end-1)
                        verb = sent[i]
                        for t in sent.root.lefts:
                            verb = t
                        if verb.lemma_ == "be":
                            final = "When was "
                        else:
                            final = "When did "
                        for token in front:
                            if token.ent_type_ == "":
                                final = final + token.lower_ + " "
                            else:
                                final = final + token.text + " "
                        for token in end:
                            if verb.lemma_ == "be" and token.lemma_ == "be":
                                final = final
                            elif verb.lemma_ != "be" and token == sent.root:
                                final = final + sent.root.lemma_ + " "
                            else:
                                final = final + token.orth_ + " "
                        # print final[:-1] + "?"
                        when_question_2 = final[:-1] + "?"
                        questions.append((when_question_2, rel+gscore(when_question_2)))
                        when.append((when_question_2, rel+gscore(when_question_2)))
                    break

            # Where questions
            for i in range(0, len(sent)-1):
                # if (sent[i].ent_type_ == "GPE" and sent[i-1].ent_type_ != "GPE"):
                if (sent[i].ent_type_ == "GPE" and sent[i-1].ent_type_ != "GPE" and sent[i-1].tag_ == "IN"):
                    # print sent
                    oneloc = str(sent[i])
                    j = i
                    while j < len(sent)-1 and sent[j+1].ent_type_ == "GPE":
                        oneloc += " " + str(sent[j+1])
                        j = j + 1
                    # possible_locations.append(oneloc)
                    i = j

                    for r in sent.root.rights:
                        if sent.root.lemma_ == "be":
                            final = "Where was "
                        elif sent.root.tag_ == "VB":
                            final = "Where do "
                        elif sent.root.tag_ == "VBP":
                            final = "Where have "
                        else:
                            final = "Where did "
                        subject = ''
                        for l in sent.root.lefts:
                            if l.right_edge.dep_=='nsubj':
                                subject = str(l.right_edge)
                                break
                            else:
                                subject =  ' '.join(w.text for w in l.subtree)
                        final += subject + " " + sent.root.lemma_ + " " + ' '.join(w.text for w  in r.subtree) + "?"
                        break
                    final = final.replace(oneloc, "")
                    where_question = final[:-1] + "?"
                    questions.append((where_question, rel+gscore(where_question)))
                    # print ""
                    break

        # DO/ DID DOES HAVE questions
            if (sent.root.pos_ == "VERB"):
                # print sent.root.tag_
                v_question = ""
                for r in sent.root.rights:

                    if sent.root.tag_ == 'VB':
                        v_question = 'Do'
                    elif sent.root.tag_ == 'VBD':
                        v_question = 'Did'
                    elif sent.root.tag_ == 'VBZ':
                        v_question = 'Does'
                    elif sent.root.tag_ == 'VBP':
                        v_question = 'Have'
                    elif sent.root.tag_ == 'VBN':
                        break
                    else:
                        break

                    subject =''
                    for l in sent.root.lefts:
                        if l.right_edge.dep_=='nsubj':
                            subject = str(l.right_edge)
                            break
                        elif v_question == 'Have':
                            subject =  ' '.join(w.text for w in l.subtree)
                            break
                        else:  subject =  ' '.join(w.text for w in l.subtree)
                    v_question += " " + subject + " " + sent.root.lemma_ + " " +' '.join(w.text for w in r.subtree) + '?'
                    break

                # print v_question
                questions.append((v_question, rel+gscore(v_question)))
                # print ""

        # Is Was Were Questions
            if (sent.root.lemma_ == "be"):
                for r in sent.root.rights:
                    if sent.root.text == '\'s':
                        isquest1 = 'Is'  # was '==', which compared instead of assigning
                    else:
                        isquest1 = sent.root.text.capitalize()
                    for l in sent.root.lefts:
                        isquest1 += " " + ' '.join(w.text for w in l.subtree)
                    isquest1 += ' ' +' '.join(w.text for w in r.subtree) +'?'
                    break
                # print isquest1
                questions.append((isquest1, rel+gscore(isquest1)))
                # print ""

        questions = sorted(questions, key=lambda x: x[1], reverse=True)
        goodQuestions = []
        for q in questions:
            if q[0].count(' ') > 3:
                sentence = q[0]
                matches = tool.check(sentence)
                if len(matches) == 0:
                    goodQuestions.append(q[0])

        for i in range(0, qNum):
            if (i * 4) < len(goodQuestions):
                print(goodQuestions[i * 4])