Example 1
from spacy.matcher import Matcher
from spacy.tokens import Doc

class RuleSentencizer(object):
    """
    Simple component that corrects some over-segmentation errors of the sentencizer using exception rules.
    Each rule must contain an IS_SENT_START token pattern; the corresponding sentence boundary is removed from the final output.
    For example, the text
    "Une indemnité de 100. 000 Frs"
    is by default segmented after "100." but it shouldn't be.
    With this simple rule:
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    the sentence corrector does the trick.

    The component is initialized this way:
    overrides = defaultdict(dict)
    overrides["rule_sentencizer"]["split"] = [
        # Split on double line breaks
        [{"IS_SPACE": True, "TEXT": { "REGEX" : "[\n]{2,}" }}, {}],
        # Split on hard punctuation
        [{"ISPUNCT": True, "TEXT" : { "IN" : [".", "!", "?"]}}, {}]
    ]
    overrides["rule_sentencizer"]["join"] = [
        # Une indemnité de 100. 000 Frs
        [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT" : True}, {"IS_DIGIT": True}]
    ]
    nlp = spacy.load(model)
    custom = RuleSentencizer(nlp, **overrides)
    nlp.add_pipe(custom)
    """
    name = "rule_sentencizer"
    split_matcher = None
    join_matcher = None
    def __init__(self, nlp, **cfg):
        if self.name in cfg:
            split_patterns = cfg[self.name].get('split', None)
            if split_patterns:
                self.split_matcher = Matcher(nlp.vocab)
                self.split_matcher.add("split", None, *split_patterns)
            join_patterns = cfg[self.name].get('join', None)
            if join_patterns:
                self.join_matcher = Matcher(nlp.vocab)
                self.join_matcher.add("join", None, *join_patterns)

    def __call__(self, doc: Doc):
        save_parsed = doc.is_parsed
        doc.is_parsed = False
        if self.split_matcher:
            matches = self.split_matcher(doc)
            for match_id, start, end in matches:
                token = doc[end - 1]
                token.is_sent_start = True
                if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                    doc[end - 2].is_sent_start = False
        if self.join_matcher:
            matches = self.join_matcher(doc)
            for match_id, start, end in matches:
                # If there is a sent start in the match, just remove it
                for token in doc[start:end]:
                    if token.is_sent_start:
                        token.is_sent_start = False
        doc.is_parsed = save_parsed if doc.is_sentenced else True
        return doc
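
A minimal usage sketch for the component above, assuming spaCy 2.x and a French model such as "fr_core_news_sm" (the sample text comes from the docstring):

import spacy
from collections import defaultdict

overrides = defaultdict(dict)
overrides["rule_sentencizer"]["join"] = [
    # Keep "100. 000" inside a single sentence
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
]
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe(RuleSentencizer(nlp, **overrides))
doc = nlp("Une indemnité de 100. 000 Frs a été versée.")
print([sent.text for sent in doc.sents])
# The amount "100. 000" should no longer be split across two sentences.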
Example 2
from spacy.matcher import Matcher

class SentenceCorrector(object):
    """
    Simple component that corrects some over-segmentation errors of the sentencizer using exception rules.
    Each rule must contain an IS_SENT_START token pattern; the corresponding sentence boundary is removed from the final output.
    For example, the text
    "Une indemnité de 100. 000 Frs"
    is by default segmented after "100." but it shouldn't be.
    With this simple rule:
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    the sentence corrector does the trick.

    The component is initialized this way:
    overrides = defaultdict(dict)
    overrides["sentence_corrector"]["rules"] = [
        # Une indemnité de 100. 000 Frs
        # Article 145-3 du code du commerce
        [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT" : True}, {"IS_DIGIT": True}],
        # Article L.145-3 du code du commerce
        [{"TEXT": {"REGEX": ".*[0-9]$"}}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    ]
    nlp = spacy.load(model)
    custom = SentenceCorrector(nlp, **overrides)
    nlp.add_pipe(custom)
    """
    name = "sentence_corrector"

    def __init__(self, nlp, **cfg):
        self.matcher = Matcher(nlp.vocab)
        if self.name in cfg:
            patterns = cfg[self.name]['rules']
            self.matcher.add("SentenceCorrector", None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        if doc.is_parsed:
            doc.is_parsed = False
            for match_id, start, end in matches:
                # If there is a sent start in the match, just remove it
                for token in doc[start:end]:
                    if token.is_sent_start:
                        token.is_sent_start = False
            doc.is_parsed = True
        return doc
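
As with Example 1, a brief usage sketch under the same assumptions (spaCy 2.x, French model "fr_core_news_sm"):

import spacy
from collections import defaultdict

overrides = defaultdict(dict)
overrides["sentence_corrector"]["rules"] = [
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
]
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe(SentenceCorrector(nlp, **overrides))
print([sent.text for sent in nlp("Une indemnité de 100. 000 Frs a été versée.").sents])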
Example 3
import logging
import os
import shlex

import spacy
from spacy.matcher import Matcher

# Assumption: the original module defines `nlp` and `logger` at module level;
# the model name here is a placeholder.
nlp = spacy.load("en_core_web_sm")
logger = logging.getLogger(__name__)

def analyse_file(path, filename, output_adverbs, output_adjectives):
    file_path = os.path.join(path, filename)
    if not os.path.isfile(file_path):
        logger.log(logging.ERROR, "File {0} is not a valid file".format(filename))
        return
    with open(file_path, 'r') as myfile:
        data = myfile.read()
        doc = nlp(data)
        adj_pattern = [{'POS': 'ADJ'}]
        adv_pattern = [{'POS': 'ADV'}]
        matcher = Matcher(nlp.vocab)
        matcher.add("Adjectives", None, adj_pattern)
        matcher.add("Adverbs", None, adv_pattern)
        matches = matcher(doc)
        adverbs = {}
        adjectives = {}
        for match_id, start, end in matches:
            string_id = nlp.vocab.strings[match_id]  # Get string representation
            span = doc[start:end]  # The matched span
            text_to_check = span.text.lower()
            if doc[start].pos_ =="ADV":
                if text_to_check in adverbs.keys():
                    adverbs[text_to_check] +=1
                else:
                    adverbs[text_to_check] = 1
            elif doc[start].pos_ =="ADJ":
                if text_to_check in adjectives.keys():
                    adjectives[text_to_check] +=1
                else:
                    adjectives[text_to_check] = 1
        with open(os.path.join(output_adverbs, "{0}_adv.txt".format(filename)), 'w') as adverb_file:
            for key in adverbs:
                adverb_file.write("{0}: {1}\n".format(key, adverbs[key]))
        with open(os.path.join(output_adjectives, "{0}_adj.txt".format(filename)), 'w') as adjective_file:
            for key in adjectives:
                adjective_file.write("{0}: {1}\n".format(key, adjectives[key]))
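
A hypothetical driver for analyse_file; the directory names are placeholders:

for filename in os.listdir("corpus"):
    analyse_file("corpus", filename, "out/adverbs", "out/adjectives")
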
def query_processing(query):
    # Split the quoted parts of the string from the rest
    str_split = shlex.split(query, posix=True)

    # Build the string whose word classes we analyse and in which some words
    # are later merged into one ('worked' 'with' becomes 'worked with')
    new_string = ""

    # Dictionary of the positions of all words in the raw string
    position = {}

    # If the element contains a quotation mark then it should not be a part of the new string
    # Note: enumerate avoids the bug of list.index(), which returns the first
    # occurrence for duplicate words
    for i, element in enumerate(str_split):
        if "'" not in element:
            new_string += element + " "

        # Add to dictionary
        position[i] = element

    # We analyse the query
    doc = nlp(new_string)

    # The pattern matcher
    matcher = Matcher(nlp.vocab)

    # We find the part of speech tags of the different words
    pos = extract_pos(matcher, doc)

    # Find words that should be seen together
    search_words = verb_adp(pos)

    # Re-merge the quoted elements with the words whose tags we have found
    new_search_words = re_merge(position, search_words)

    # We find the proper predicates for a query
    predicates(new_search_words)

    # Uncomment this to reverse the list so we can correctly bind things
    # (it needs to stay commented out for the test)
    # new_search_words.reverse()

    query = result(new_search_words)

    return query
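
For reference, a sketch of the quoting convention the function above seems to assume (the query is made up): with posix=True, shlex.split keeps a double-quoted phrase together and preserves the single quotes inside it, which is what the "'" check above relies on.

import shlex
print(shlex.split("worked with \"'Neil Young'\"", posix=True))
# ['worked', 'with', "'Neil Young'"]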
Example 4
import os

import pandas as pd
import spacy
from spacy.matcher import Matcher

# Note: nlpUtils is a project-local helper class; its import is not shown in the original snippet.

class Relation_Extractor:
    def __init__(self):
        path = os.path.dirname(os.path.realpath(__file__))
        self.df = pd.read_csv(os.path.join(path, "../data/countries.csv"))
        self.utils = nlpUtils()
        self.nlp = spacy.load("en_core_web_sm")
        self.nationality_matcher = Matcher(self.nlp.vocab)
        nat_pattern = list()
        nat_pattern.append([{
            'LEMMA': 'be'
        }, {
            'POS': 'DET'
        }, {
            'ENT_TYPE': {
                "IN": ["GPE", "NORP", "LANGUAGE"]
            },
            'OP': "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]
            },
            "OP": "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "ADJ"]
            },
            "OP": "+"
        }])
        nat_pattern.append([{
            'LEMMA': 'be'
        }, {
            'POS': 'DET'
        }, {
            'ENT_TYPE': {
                "IN": ["GPE", "NORP", "LANGUAGE"]
            },
            'OP': "*"
        }, {
            "DEP": {
                "IN": ["punct", "compound", "amod", "nmod"]
            },
            "OP": "*"
        }, {
            'POS': 'NOUN'
        }, {
            "POS": {
                "IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]
            },
            "OP": "*"
        }, {
            'ORTH': 'and'
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]
            },
            "OP": "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "ADJ"]
            },
            "OP": "+"
        }])

        self.nationality_matcher.add("nationality", nat_pattern)
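        # For illustration (hypothetical sentences): these two patterns target
        # copular descriptions such as "is an American singer-songwriter" or
        # "is a French producer and composer".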

        self.influence_matcher = Matcher(self.nlp.vocab)

        influence1 = list()
        influence1.append([{
            'LEMMA': {
                "IN": ["inspire", "influence"]
            },
            "POS": 'VERB'
        }, {
            'ORTH': 'by'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence1", influence1)

        influence2 = list()
        influence2.append([{
            'LEMMA': {
                "IN": ["cite", "refer", "list", "mention", "credit", "claim"]
            },
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': {
                "IN": ["as", "among"]
            }
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        influence2.append([{
            'LEMMA': {
                "IN": ["cite", "refer", "list", "mention", "credit", "claim"]
            },
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence2", influence2)

        influence3 = list()
        influence3.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'ORTH': 'include',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence3", influence3)

        influence4 = list()
        influence4.append([{
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            'ORTH': 'cited'
        }, {
            'ORTH': 'by'
        }, {
            "OP": "*"
        }, {
            'ORTH': 'include',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence4", influence4)

        influence5 = list()
        influence5.append([{
            'LEMMA': 'cite',
            "POS": 'VERB'
        }, {
            'ORTH': ','
        }, {
            "ORTH": "as"
        }, {
            "OP": "*"
        }, {
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence5", influence5)

        influence6 = list()
        influence6.append([{
            'LEMMA': 'state',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence6", influence6)

        influence7 = list()
        influence7.append([{
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            "ORTH": "?"
        }, {
            "ORTH": "such"
        }, {
            "ORTH": "as"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence7", influence7)

        influence8 = list()
        influence8.append([{
            'LEMMA': {
                "IN": ["cite", "name"]
            },
            "POS": "VERB"
        }, {
            "OP": "*"
        }, {
            "ORTH": "as"
        }, {
            "ORTH": "one"
        }, {
            "ORTH": "of"
        }, {
            "OP": "*"
        }, {
            "ORTH": "'s"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence8", influence8)

        influence9 = list()
        influence9.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "ORTH": "including"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence9", influence9)

        influence10 = list()
        influence10.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }, {
            "ORTH": "from"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence10", influence10)

        influence11 = list()
        influence11.append([{
            'ORTH': 'citing',
            "POS": 'VERB'
        }, {
            "ORTH": "as"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence11", influence11)

        influence12 = list()
        influence12.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence12", influence12)

        influence13 = list()
        influence13.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'ORTH': 'of'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence13", influence13)

        influence14 = list()
        influence14.append([{
            'LEMMA': 'inspiration',
            "POS": 'NOUN'
        }, {
            'ORTH': {
                "IN": ["from", "include"]
            }
        }, {
            "OP": "*"
        }])
        influence14.append([{
            'LEMMA': 'cite',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            "ORTH": "as"
        }, {
            'LEMMA': 'inspiration',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence14", influence14)

        self.mappa = {
            self.nlp.vocab.strings["influence{}".format(i)]: "influence{}".format(i)
            for i in range(1, 15)
        }

    # takes a tuple (match, id)
    def get_countries_from_match(self, match):
        nationalities = list()
        for ent in match[0].ents:
            if ent.label_ in ["NORP", "GPE", "LANGUAGE"]:
                country = self.nationality_to_country(ent.text)
                if country is not None:
                    nationalities.append(country)
        return nationalities

    # takes a tuple (match, id)
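    # e.g. (hypothetical): for a matched span like "is an American singer and
    # actor", it returns the types ["singer", "actor"].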
    def get_types_from_match(self, match):
        types = list()
        type = ""
        prev_tok = ""
        for tok in match[0]:
            if (tok.orth_ == "and" or tok.orth_ == ",") and type != "":
                types.append(type)
                type = ""
            if tok.ent_type_ not in [
                    "NORP", "GPE", "LANGUAGE"
            ] and tok.lemma_ != "be" and tok.pos_ != "DET" and tok.orth_ != "and" and tok.orth_ != ",":
                if type == "" or tok.text in ["-", "/"
                                              ] or prev_tok.text in ["-", "/"]:
                    type += tok.text
                else:
                    type += " " + tok.text
            prev_tok = tok
        if type != "":
            types.append(type)

        return types

    # takes a tuple (match, id)
    def get_influencers_from_match(self, match, connections):
        return self.get_artist_from_sentence(match[0].text, connections)

    def get_artist_from_sentence(self, sentence, connections):
        influencers = list()
        for connection in connections:
            if connection in sentence:
                influencers.append(connection)
        return influencers

    # takes a sentence (string) and a relation, tries the match, and returns a couple (span, id) where id is the id of the pattern that matched
    def match(self, sentence, relation):
        doc = self.utils.doc_from_text(sentence)
        if relation == "nationality":
            matches = self.nationality_matcher(doc)
        else:
            matches = self.influence_matcher(doc)

        lista_spans = list()
        for id, start, end in matches:
            span = doc[start:end]  # The matched span
            lista_spans.append((span, id))

        # clean the matches and extract the first one, since it is often the most significant for nationality;
        # for the influence relation, one match is enough
        text_list, span_list = self.utils.clean(lista_spans)
        indx = float("inf")
        span_out = None
        for couple in span_list:
            span = couple[0]
            if span[0].i < indx:
                indx = span[0].i
                span_out = (span, couple[1])
        return span_out

    # takes a string and returns a string
    def nationality_to_country(self, nationality):
        res = self.df[self.df['nationality'].str.lower() ==
                      nationality.lower()].reset_index()
        if res.shape[0] > 0:
            # .at replaces the long-deprecated DataFrame.get_value
            return res.at[0, 'en_short_name']
        return None

    # We should do this with a dictionary: scan the csv/json one row/document at a time, process it, and create a key-value pair to put in the dict

    # We assume that types are comma separated. Nationality can be declared immediately before the types or in the form "from <Country>"
    def extract_naitionalityAndType(self, sentence):
        # x, matches = self.match("nationality_type", sentence) # the function must work for all match types!
        match = self.match(
            sentence, "nationality"
        )  # looking for string that matches both nationality and types in an adjacent way or just types
        if match is not None:
            self.utils.print_matches(sentence, [match[0].text])
            countries = self.get_countries_from_match(
                match)  # this kind of information is always in one single span
            types = self.get_types_from_match(match)
            print(countries)
            print(types)
        else:
            print(sentence)
            print("NO MATCHES")

    # if nationalities is empty:
    #    nationality_matches = match("nationality2", sentence)
    # nationalities = extract a list of nationalities from  nationalities_matches
    # return nationalities, types

    def extract_influencers(self, sentence, connections):
        match = self.match(sentence, "influencedBy")
        influencers = None
        if match is not None:
            # self.utils.print_matches(sentence, [match[0].text])
            influencers = self.get_influencers_from_match(match, connections)
        # print(influencers)
        return influencers
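
A minimal usage sketch for the class above; it assumes the project's countries.csv and nlpUtils helper are available, and the sentences are made up:

extractor = Relation_Extractor()
extractor.extract_naitionalityAndType("He is an American singer-songwriter.")
influencers = extractor.extract_influencers(
    "He was heavily influenced by Bob Dylan.", ["Bob Dylan", "Neil Young"])
print(influencers)  # expected: ['Bob Dylan'], depending on the model and nlpUtils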