from spacy.matcher import Matcher
from spacy.tokens import Doc


class RuleSentencizer(object):
    """
    Simple component that corrects some over-segmentation errors of the sentencizer using exception rules.
    Each "join" rule must have an IS_SENT_START token pattern; that sentence boundary is removed from the
    final output. For example, the text "Une indemnité de 100. 000 Frs" is by default segmented after
    "100." but shouldn't be. With this simple rule:
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    the component does the trick.

    The component is initialized this way:

    overrides = defaultdict(dict)
    overrides["rule_sentencizer"]["split"] = [
        # Split on double line breaks
        [{"IS_SPACE": True, "TEXT": {"REGEX": "[\n]{2,}"}}, {}],
        # Split on hard punctuation
        [{"IS_PUNCT": True, "TEXT": {"IN": [".", "!", "?"]}}, {}]
    ]
    overrides["rule_sentencizer"]["join"] = [
        # Une indemnité de 100. 000 Frs
        [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    ]
    nlp = spacy.load(model)
    custom = RuleSentencizer(nlp, **overrides)
    nlp.add_pipe(custom)
    """
    name = "rule_sentencizer"
    split_matcher = None
    join_matcher = None

    def __init__(self, nlp, **cfg):
        if self.name in cfg:
            split_patterns = cfg[self.name].get('split', None)
            if split_patterns:
                self.split_matcher = Matcher(nlp.vocab)
                self.split_matcher.add("split", None, *split_patterns)
            join_patterns = cfg[self.name].get('join', None)
            if join_patterns:
                self.join_matcher = Matcher(nlp.vocab)
                self.join_matcher.add("join", None, *join_patterns)

    def __call__(self, doc: Doc):
        # is_sent_start can only be written while the doc is not flagged as parsed (spaCy v2)
        save_parsed = doc.is_parsed
        doc.is_parsed = False
        if self.split_matcher:
            matches = self.split_matcher(doc)
            for match_id, start, end in matches:
                # The last token of the match starts a new sentence ...
                token = doc[end - 1]
                token.is_sent_start = True
                # ... and a boundary set on the token just before it is dropped
                if end - 2 >= 0 and doc[end - 2].is_sent_start is True:
                    doc[end - 2].is_sent_start = False
        if self.join_matcher:
            matches = self.join_matcher(doc)
            for match_id, start, end in matches:
                # If there is a sent start in the match, just remove it
                for token in doc[start:end]:
                    if token.is_sent_start:
                        token.is_sent_start = False
        doc.is_parsed = save_parsed if doc.is_sentenced else True
        return doc
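# A minimal usage sketch for RuleSentencizer, assuming spaCy 2.x (callable pipeline
# components) and a French model; the model name and sample text are illustrative.
from collections import defaultdict

import spacy

overrides = defaultdict(dict)
overrides["rule_sentencizer"]["join"] = [
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
]
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe(RuleSentencizer(nlp, **overrides))
doc = nlp("Une indemnité de 100. 000 Frs a été versée.")
print([sent.text for sent in doc.sents])  # the boundary after "100." is gone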
from spacy.matcher import Matcher


class SentenceCorrector(object):
    """
    Simple component that corrects some over-segmentation errors of the sentencizer using exception rules.
    Each rule must have an IS_SENT_START token pattern; that sentence boundary is removed from the final
    output. For example, the text "Une indemnité de 100. 000 Frs" is by default segmented after "100."
    but shouldn't be. With this simple rule:
    [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    the sentence corrector does the trick.

    The component is initialized this way:

    overrides = defaultdict(dict)
    overrides["sentence_corrector"]["rules"] = [
        # Une indemnité de 100. 000 Frs
        # Article 145-3 du code du commerce
        [{"IS_DIGIT": True}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}],
        # Article L.145-3 du code du commerce
        [{"TEXT": {"REGEX": ".*[0-9]$"}}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
    ]
    nlp = spacy.load(model)
    custom = SentenceCorrector(nlp, **overrides)
    nlp.add_pipe(custom)
    """
    name = "sentence_corrector"

    def __init__(self, nlp, **cfg):
        self.matcher = Matcher(nlp.vocab)
        if self.name in cfg:
            patterns = cfg[self.name]['rules']
            self.matcher.add("SentenceCorrector", None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        # Sentence boundaries can only be edited while the doc is not flagged as parsed (spaCy v2)
        if doc.is_parsed:
            doc.is_parsed = False
        for match_id, start, end in matches:
            # If there is a sent start in the match, just remove it
            for token in doc[start:end]:
                if token.is_sent_start:
                    token.is_sent_start = False
        doc.is_parsed = True
        return doc
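# A comparable sketch for SentenceCorrector, again assuming spaCy 2.x; the legal
# citation sample mirrors the "Article L.145-3" rule from the docstring above.
from collections import defaultdict

import spacy

overrides = defaultdict(dict)
overrides["sentence_corrector"]["rules"] = [
    [{"TEXT": {"REGEX": ".*[0-9]$"}}, {"IS_SENT_START": True, "IS_PUNCT": True}, {"IS_DIGIT": True}]
]
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe(SentenceCorrector(nlp, **overrides))
doc = nlp("L'article L.145-3 du code de commerce s'applique.")
print([sent.text for sent in doc.sents])  # "L.145-3" no longer splits the sentence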
import logging
import os

from spacy.matcher import Matcher

# assumes module-level globals: logger = logging.getLogger(...) and nlp = spacy.load(...)


def analyse_file(path, filename, output_adverbs, output_adjectives):
    file_path = os.path.join(path, filename)
    if not os.path.isfile(file_path):
        logger.log(logging.ERROR, "File {0} is not a valid file".format(filename))
        return  # bail out; opening a missing file would raise below
    with open(file_path, 'r') as myfile:
        data = myfile.read()
    doc = nlp(data)

    # One single-token pattern per part of speech
    adj_pattern = [{'POS': 'ADJ'}]
    adv_pattern = [{'POS': 'ADV'}]
    matcher = Matcher(nlp.vocab)
    matcher.add("Adjectives", None, adj_pattern)
    matcher.add("Adverbs", None, adv_pattern)
    matches = matcher(doc)

    adverbs = {}
    adjectives = {}
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span
        text_to_check = span.text.lower()
        if doc[start].pos_ == "ADV":
            if text_to_check in adverbs:
                adverbs[text_to_check] += 1
            else:
                adverbs[text_to_check] = 1
        elif doc[start].pos_ == "ADJ":
            if text_to_check in adjectives:
                adjectives[text_to_check] += 1
            else:
                adjectives[text_to_check] = 1

    with open(os.path.join(output_adverbs, "{0}_adv.txt".format(filename)), 'w') as adverb_file:
        for key in adverbs:
            adverb_file.write("{0}: {1}\n".format(key, adverbs[key]))
    with open(os.path.join(output_adjectives, "{0}_adj.txt".format(filename)), 'w') as adjective_file:
        for key in adjectives:
            adjective_file.write("{0}: {1}\n".format(key, adjectives[key]))
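# A hypothetical driver for analyse_file, spelling out the module-level globals
# the function relies on; the directory and file names are placeholders.
import logging

import spacy

logging.basicConfig()
logger = logging.getLogger(__name__)
nlp = spacy.load("en_core_web_sm")

analyse_file("corpus", "chapter1.txt", "out/adverbs", "out/adjectives")
# writes out/adverbs/chapter1.txt_adv.txt and out/adjectives/chapter1.txt_adj.txt,
# one "token: count" line per distinct adverb or adjective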
import shlex

from spacy.matcher import Matcher

# nlp plus the helpers extract_pos, verb_adp, re_merge, predicates and result
# are defined elsewhere in this project


def query_processing(query):
    # Split the parts of the string in quotation from the rest of the string
    str_split = shlex.split(query, posix=True)
    # Build the string whose word classes we analyse and in which multi-word
    # units are merged ('worked' 'with' becomes 'worked with')
    new_string = ""
    # Dictionary of the positions of the words in the raw string
    position = {}
    # If the element contains a quotation mark it should not be part of the new string.
    # enumerate() keeps the true position; list.index() would return the first
    # occurrence for duplicate words.
    for i, element in enumerate(str_split):
        if "'" not in element:
            new_string += element + " "
            # Add to the dictionary
            position[i] = element
    # Analyse the query
    doc = nlp(new_string)
    # The pattern matcher
    matcher = Matcher(nlp.vocab)
    # Find the part-of-speech tags of the different words
    pos = extract_pos(matcher, doc)
    # Find words that should be treated as one unit
    search_words = verb_adp(pos)
    # Re-merge the elements in quotation with the words whose tags we found
    new_search_words = re_merge(position, search_words)
    # Find the proper predicates for the query
    predicates(new_search_words)
    # Uncomment to reverse the list so things bind correctly;
    # needs to stay commented out for the test
    # new_search_words.reverse()
    query = result(new_search_words)
    return query
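# query_processing leans on project-local helpers, so only the splitting step is
# easy to demo in isolation; this shows what shlex hands back for a mixed query.
import shlex

print(shlex.split("who worked with 'machine learning'", posix=True))
# ['who', 'worked', 'with', 'machine learning'] -- the quoted phrase stays one
# element, though posix=True strips the quote characters themselves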
import os

import pandas as pd
import spacy
from spacy.matcher import Matcher

# nlpUtils is a project-local helper (its import path is project-specific)


class Relation_Extractor:
    def __init__(self):
        path = os.path.dirname(os.path.realpath(__file__))
        self.df = pd.read_csv(os.path.join(path, "../data/countries.csv"))
        self.utils = nlpUtils()
        self.nlp = spacy.load("en_core_web_sm")

        # Patterns such as "is a French singer" or "is an Italian actor and director".
        # Matcher.add here uses the spaCy v2.3+/v3 signature: all patterns in one list.
        self.nationality_matcher = Matcher(self.nlp.vocab)
        nat_pattern = [
            [{'LEMMA': 'be'}, {'POS': 'DET'},
             {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
             {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]}, "OP": "*"},
             {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"}],
            [{'LEMMA': 'be'}, {'POS': 'DET'},
             {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
             {"DEP": {"IN": ["punct", "compound", "amod", "nmod"]}, "OP": "*"},
             {'POS': 'NOUN'},
             {"POS": {"IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]}, "OP": "*"},
             {'ORTH': 'and'},
             {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]}, "OP": "*"},
             {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"}],
        ]
        self.nationality_matcher.add("nationality", nat_pattern)

        # One matcher with fourteen pattern families for "influenced by" phrasings
        self.influence_matcher = Matcher(self.nlp.vocab)
        influence1 = [[{'LEMMA': {"IN": ["inspire", "influence"]}, "POS": 'VERB'},
                       {'ORTH': 'by'}, {"OP": "*"}]]
        self.influence_matcher.add("influence1", influence1)
        influence2 = [
            [{'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
             {"OP": "*"}, {'LEMMA': {"IN": ["as", "among"]}}, {"OP": "*"},
             {'LEMMA': 'influence', "POS": 'NOUN'}, {"OP": "*"}],
            [{'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
             {"OP": "*"}, {'LEMMA': 'be'}, {"OP": "*"}, {'LEMMA': 'influence', "POS": 'NOUN'}],
        ]
        self.influence_matcher.add("influence2", influence2)
        influence3 = [[{'LEMMA': 'influence', "POS": 'NOUN'},
                       {'ORTH': 'include', "POS": 'VERB'}, {"OP": "*"}]]
        self.influence_matcher.add("influence3", influence3)
        influence4 = [[{'ORTH': 'influences', "POS": 'NOUN'}, {'ORTH': 'cited'}, {'ORTH': 'by'},
                       {"OP": "*"}, {'ORTH': 'include', "POS": 'VERB'}, {"OP": "*"}]]
        self.influence_matcher.add("influence4", influence4)
        influence5 = [[{'LEMMA': 'cite', "POS": 'VERB'}, {'ORTH': ','}, {"ORTH": "as"}, {"OP": "*"},
                       {'ORTH': 'influences', "POS": 'NOUN'}, {"OP": "*"}]]
        self.influence_matcher.add("influence5", influence5)
        influence6 = [[{'LEMMA': 'state', "POS": 'VERB'}, {"OP": "*"},
                       {'LEMMA': 'influence', "POS": 'NOUN'}, {'LEMMA': 'be'}, {"OP": "*"}]]
        self.influence_matcher.add("influence6", influence6)
        influence7 = [[{'ORTH': 'influences', "POS": 'NOUN'}, {"ORTH": "?"},
                       {"ORTH": "such"}, {"ORTH": "as"}, {"OP": "*"}]]
        self.influence_matcher.add("influence7", influence7)
        influence8 = [[{'LEMMA': {"IN": ["cite", "name"]}, "POS": "VERB"}, {"OP": "*"}, {"ORTH": "as"},
                       {"ORTH": "one"}, {"ORTH": "of"}, {"OP": "*"}, {"ORTH": "'s"},
                       {'LEMMA': 'influence', "POS": 'NOUN'}]]
        self.influence_matcher.add("influence8", influence8)
        influence9 = [[{'LEMMA': 'influence', "POS": 'NOUN'}, {"ORTH": "including"}, {"OP": "*"}]]
        self.influence_matcher.add("influence9", influence9)
        influence10 = [[{'LEMMA': 'influence', "POS": 'NOUN'}, {"OP": "*"},
                        {"ORTH": "from"}, {"OP": "*"}]]
        self.influence_matcher.add("influence10", influence10)
        influence11 = [[{'ORTH': 'citing', "POS": 'VERB'}, {"ORTH": "as"},
                        {'LEMMA': 'influence', "POS": 'NOUN'}, {"OP": "*"}]]
        self.influence_matcher.add("influence11", influence11)
        influence12 = [[{'LEMMA': 'influence', "POS": 'NOUN'}, {'LEMMA': 'be'}, {"OP": "*"}]]
        self.influence_matcher.add("influence12", influence12)
        influence13 = [[{'LEMMA': 'influence', "POS": 'NOUN'}, {'ORTH': 'of'}, {"OP": "*"}]]
        self.influence_matcher.add("influence13", influence13)
        influence14 = [
            [{'LEMMA': 'inspiration', "POS": 'NOUN'}, {'ORTH': {"IN": ["from", "include"]}}, {"OP": "*"}],
            [{'LEMMA': 'cite', "POS": 'VERB'}, {"OP": "*"}, {"ORTH": "as"},
             {'LEMMA': 'inspiration', "POS": 'NOUN'}],
        ]
        self.influence_matcher.add("influence14", influence14)

        # Map hash-based match ids back to the human-readable pattern names
        self.mappa = {self.nlp.vocab.strings["influence{0}".format(i)]: "influence{0}".format(i)
                      for i in range(1, 15)}

    # takes a tuple (match, id)
    def get_countries_from_match(self, match):
        nationalities = list()
        for ent in match[0].ents:
            if ent.label_ in ["NORP", "GPE", "LANGUAGE"]:
                country = self.nationality_to_country(ent.text)
                if country is not None:
                    nationalities.append(country)
        return nationalities

    # takes a tuple (match, id)
    def get_types_from_match(self, match):
        types = list()
        type = ""
        prev_tok = ""
        for tok in match[0]:
            if (tok.orth_ == "and" or tok.orth_ == ",") and type != "":
                types.append(type)
                type = ""
            if (tok.ent_type_ not in ["NORP", "GPE", "LANGUAGE"] and tok.lemma_ != "be"
                    and tok.pos_ != "DET" and tok.orth_ != "and" and tok.orth_ != ","):
                # glue hyphen/slash compounds together, otherwise join with a space
                if type == "" or tok.text in ["-", "/"] or prev_tok.text in ["-", "/"]:
                    type += tok.text
                else:
                    type += " " + tok.text
                prev_tok = tok
        if type != "":
            types.append(type)
        return types

    # takes a tuple (match, id)
    def get_influencers_from_match(self, match, connections):
        return self.get_artist_from_sentence(match[0].text, connections)

    def get_artist_from_sentence(self, sentence, connections):
        influencers = list()
        for connection in connections:
            if connection in sentence:
                influencers.append(connection)
        return influencers

    # takes a sentence (string) and a relation name, tries the match and returns a
    # couple (span, id) where id is the id of the pattern that matched
    def match(self, sentence, relation):
        doc = self.utils.doc_from_text(sentence)
        if relation == "nationality":
            matches = self.nationality_matcher(doc)
        else:
            matches = self.influence_matcher(doc)
        lista_spans = list()
        for id, start, end in matches:
            span = doc[start:end]  # The matched span
            lista_spans.append((span, id))
        # Clean the matches and keep the leftmost one, since it is often the
        # significant one for nationality; for the influence relation one match is enough
        text_list, span_list = self.utils.clean(lista_spans)
        indx = 1000  # sentinel larger than any realistic token index
        span_out = None
        for couple in span_list:
            span = couple[0]
            if span[0].i < indx:
                indx = span[0].i
                span_out = (span, couple[1])
        return span_out

    # takes a string and returns a string
    def nationality_to_country(self, nationality):
        res = self.df[self.df['nationality'].str.lower() == nationality.lower()].reset_index()
        if res.shape[0] > 0:
            return res.at[0, 'en_short_name']  # DataFrame.get_value() was removed from pandas
        return None

    # We should do this with a dictionary: scan the csv/json one row/document at a
    # time, process it and create a key-value pair to put in the dict.
    # We assume that types are comma separated. Nationality can be declared
    # immediately before the types or in the form "from <Country>".
    def extract_nationalityAndType(self, sentence):
        # x, matches = self.match("nationality_type", sentence)
        # the function must work for every type of match!
        # Looking for a string that matches both nationality and types in an
        # adjacent way, or just types
        match = self.match(sentence, "nationality")
        if match is not None:
            self.utils.print_matches(sentence, [match[0].text])
            # this kind of information is always in one single span
            countries = self.get_countries_from_match(match)
            types = self.get_types_from_match(match)
            print(countries)
            print(types)
        else:
            print(sentence)
            print("NO MATCHES")
        # if nationalities is empty:
        #     nationality_matches = match("nationality2", sentence)
        #     nationalities = extract a list of nationalities from nationality_matches
        # return nationalities, types

    def extract_influencers(self, sentence, connections):
        match = self.match(sentence, "influencedBy")
        influencers = None
        if match is not None:
            # self.utils.print_matches(sentence, [match[0].text])
            influencers = self.get_influencers_from_match(match, connections)
            # print(influencers)
        return influencers
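# A hypothetical smoke test for Relation_Extractor; the sentences, the artist list
# and the countries.csv lookup are assumptions, not part of the original code.
extractor = Relation_Extractor()
extractor.extract_nationalityAndType("He is an American singer and songwriter.")
# prints the countries resolved through countries.csv and the extracted type strings

influencers = extractor.extract_influencers(
    "She has cited Nina Simone and Aretha Franklin as influences.",
    ["Nina Simone", "Aretha Franklin", "Miles Davis"])
print(influencers)  # expected: ['Nina Simone', 'Aretha Franklin']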