def extract_readability_scores(policy_folder):
    read = Readability()
    nlp.add_pipe(read, last=True)
    print("Policy, Grade, Ease")
    for filename in os.listdir(policy_folder):
        if filename[0] != ".":
            with open(os.path.join(policy_folder, filename), 'r') as myfile:
                data = myfile.read()
            doc = nlp(data)
            print("{0}, {1}, {2}".format(filename,
                                         doc._.flesch_kincaid_grade_level,
                                         doc._.flesch_kincaid_reading_ease))
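# Hedged usage sketch (not part of the original file): extract_readability_scores
# relies on a module-level `nlp` pipeline and the `os` import shown here; the
# model name and the policies folder path are illustrative assumptions.
import os
import spacy
from spacy_readability import Readability

nlp = spacy.load('en_core_web_md')
extract_readability_scores('./policies')  # prints one "Policy, Grade, Ease" row per file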
def __init__(self, data, spacy_model='en_core_web_lg', feature_array=None):
    """Instantiates SimilarityVectorizer, loads spaCy model."""
    print("Initializing spaCy...")
    self.spacy_model = spacy_model
    self.nlp = spacy.load(spacy_model)
    self.read = Readability()
    self.nlp.add_pipe(self.read, last=True)
    if isinstance(data, dict):
        self.data_dict = data
        self.data = None
    else:
        self.data = data
        self.data_dict = self.data.data_dict
    self.feature_array = feature_array
def compute_readability(nlp, sentences):
    read = Readability()
    nlp.add_pipe(read, last=True)
    scores = []
    if len(sentences) == 0:
        return 0.0
    # Token.set_extension('context', default=False, force=True)
    for s in sentences:
        sent = nlp(s)
        # sent._.automated_readability_index could be added here as a third metric
        avg_score = sent._.flesch_kincaid_grade_level + sent._.coleman_liau_index
        if not avg_score:
            scores.append(0)
        else:
            scores.append(avg_score / 2)  # mean of the two metrics actually summed
    return np.mean(scores)
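# Hedged usage sketch (not from the original module): the caller passes a plain
# spaCy model and a list of sentence strings; compute_readability registers the
# Readability pipe itself. `np` is assumed to be numpy, as the function above uses np.mean.
import numpy as np
import spacy
from spacy_readability import Readability

if __name__ == "__main__":
    sample_sentences = [
        "The cat sat on the mat.",
        "Readability formulas estimate how difficult a text is to read.",
    ]
    print(compute_readability(spacy.load("en_core_web_sm"), sample_sentences))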
def __init__(self, config, max_workers=None, verbose=False):
    self.config = config
    self.available_languages = {
        'en': 'en_core_web_md',
        'de': 'de_core_news_sm',
        'fr': 'fr_core_news_sm',
        'es': 'es_core_news_sm',
        'it': 'it_core_news_sm',
        'multi': 'xx_ent_wiki_sm'
    }
    # We could also use BERT distance, but it is slower and does not support
    # multiple languages.
    # self.distance = BERT_distance()
    print("Preloading Word Embeddings for selected languages...")
    # List of the languages we want to support
    dim = 200
    vs = 200000
    self.languages = config.languages
    # Check that every requested language is available
    for lang in self.languages:
        if lang not in self.available_languages:
            raise Exception(
                "Sorry, language '{}' not yet supported".format(lang))
    self.verbose = verbose
    self.max_workers = max_workers
    self.transition = transitions_handler(self.config.transition_data_path)
    self.model_summarizer = {
        l: ModelSummarizer(config, lang=l, verbose=self.verbose)
        for l in self.languages
    }
    self.embedder = {l: SisterEmbedder(lang=l) for l in self.languages}
    self.nlp = {
        l: spacy.load(self.available_languages[l])
        for l in self.languages
    }
    # Add Readability to every language's nlp pipeline
    for lang in self.nlp:
        read = Readability()
        nlp = self.get_nlp(lang)
        nlp.add_pipe(read, last=True)
def _calculate_readability(self, doc: Doc):
    """Call the readability score functions."""
    assert doc.has_extension(STAGE.READABILITY)
    readability = Readability()
    scores = {"summary": {}, "text": {}}
    scores["text"]["dale_chall"] = readability.dale_chall(doc)
    scores["text"]["smog"] = readability.smog(doc)
    if self.summary_doc:
        scores["summary"]["dale_chall"] = readability.dale_chall(self.summary_doc)
        scores["summary"]["smog"] = readability.smog(self.summary_doc)
    return scores
import gzip
import json
import os

import pandas as pd
import spacy
# import neuralcoref
import pytextrank
import networkx as nx
from spacy_readability import Readability

nlp = spacy.load('en_core_web_md')
# neuralcoref.add_to_pipe(nlp)
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank")
read = Readability()
nlp.add_pipe(read, last=True)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


def parse(path):
    """Yield one JSON record per line from a gzipped JSON-lines file."""
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)


def get_df(path):
    i = 0
    df = {}
# pip install spacy
# spacy.load("en_core_web_sm")
# pip install spacy-readability
from collections import Counter

import pandas as pd
import spacy
from spacy_readability import Readability

nlp = spacy.load('en_core_web_sm')
read = Readability(nlp)

# Share of nouns in a sample sentence
y = Counter([token.pos_ for token in nlp('The cat sat on the mat.')])
y['NOUN'] / len([token.pos_ for token in nlp('The cat sat on the mat.')])

dfall = pd.DataFrame()
df2 = pd.DataFrame()
place = 0
for i in df['body']:
    i = str(i)
    y = Counter([token.pos_ for token in nlp(i)])
    noun = y['NOUN'] / len([token.pos_ for token in nlp(i)])
def __init__(self, model='en_core_web_sm', sources_csv=None,
             wikifier_output_dir='', max_length=3000000):
    """Initialize the preprocessor."""
    # Save wikifier option
    self.wikifier_output_dir = wikifier_output_dir
    # Load the language model
    # print('Preparing language model...')
    self.nlp = spacy.load(model)
    self.nlp.max_length = max_length
    # Import readability
    # print('Testing readability...')
    try:
        from spacy_readability import Readability
        self.collect_readability_scores = True
    except ImportError:
        msg = """The spacy-readability module is not installed on your system.
        Readability scores will be unavailable unless you `pip install spacy-readability`."""
        # print(msg)
        self.collect_readability_scores = False
    # Configure language model options
    self.add_stopwords = []
    self.remove_stopwords = []
    self.skip_entities = ['CARDINAL', 'DATE (except months)', 'QUANTITY', 'TIME']
    self.lemmatization_cases = {
        "humanities": [{ORTH: u'humanities', LEMMA: u'humanities',
                        POS: u'NOUN', TAG: u'NNS'}]
    }
    # Configure entity categories to be skipped when merging entities
    self.options = {
        'merge_noun_chunks': False,
        'merge_subtokens': False,
        'skip_ents': self.skip_entities,
        'collect_readability_scores': self.collect_readability_scores
    }
    # Handle lemmatisation exceptions
    for k, v in self.lemmatization_cases.items():
        self.nlp.tokenizer.add_special_case(k, v)
    # Add and remove custom stop words - disabled for optimisation
    # for word in self.add_stopwords:
    #     self.nlp.vocab[word].is_stop = True
    # for word in self.remove_stopwords:
    #     self.nlp.vocab[word].is_stop = False
    self.nlp.add_pipe(self.skip_ents, after='ner')
    # Add readability to the pipeline
    if self.collect_readability_scores:
        self.nlp.add_pipe(Readability())
    # Load the sources file - disabled for optimisation
    self.sources = ''
    if sources_csv:
        with open(sources_csv, 'r') as f:
            self.sources = [dict(line) for line in csv.DictReader(f)]
class NLP():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Readability(), last=True)
    matcher = Matcher(nlp.vocab)

    def __init__(self, text):
        self.doc = self.nlp(text)
        self.blob = TextBlob(self.doc.text)
        self.readability = self.readability_indexes()
        self.word_tokens = self.tokenize_words(self.doc)
        self.sents = list(self.doc.sents)
        self.polysyllables = self.get_polysyllables(self.word_tokens[1])
        self.nominalized_words = self.get_nominalized(self.word_tokens[1])
        self.pos = self.get_pos(self.doc)
        self.prepositional_phrases = self.get_pps(self.doc)
        self.passive_phrases = self.get_passive_phrases(self.doc)
        self.get_pronouns(self.doc)
        self.get_weak_verbs(self.doc)
        self.sentence_count = len(self.sents)
        self.statistics()
        self.word_count = len(self.word_tokens[1])
        self.get_freq_dist()
        # self.lexicon_count = len(self.lexicon)
        self.get_intities()

    def readability_indexes(self):
        readability_scores = {}
        readability_scores['ari'] = self.doc._.automated_readability_index
        readability_scores['coleman_liau_index'] = self.doc._.coleman_liau_index
        readability_scores['dale_chall'] = self.doc._.dale_chall
        readability_scores['flesch_kincaid_grade'] = self.doc._.flesch_kincaid_grade_level
        readability_scores['flesch_kincaid_re'] = self.doc._.flesch_kincaid_reading_ease
        readability_scores['forcast'] = self.doc._.forcast
        readability_scores['smog'] = self.doc._.smog
        return readability_scores

    def tokenize_words(self, document):
        spacy_word_tokens = [t.text for t in document]
        no_punct_word_tokens = []
        for w in spacy_word_tokens:
            for p in punctuation:
                w = w.replace(p, "").replace("\n", "")
            no_punct_word_tokens.append(w.lower())
        no_punct_word_tokens.remove('')
        return (spacy_word_tokens, no_punct_word_tokens)

    def get_polysyllables(self, some_list):
        polysyllables = []
        for w in some_list:
            if syllables.estimate(w) > 3:
                polysyllables.append(w)
        return polysyllables

    # def get_polysyllables2(self, doc):
    #     phoney = BigPhoney()
    #     self.total_syllables = phoney.count_syllables(self.doc.text)
    #     self.polys = []
    #     for token in doc:
    #         if phoney.count_syllables(token.text) > 3:
    #             self.polys.append(token.text)
    #         else:
    #             pass

    def get_nominalized(self, list):
        nominalized_words = {}
        nominalized_words['-tion words'] = []
        for word in list:
            if word.endswith("tion"):
                nominalized_words['-tion words'].append(word)
            else:
                pass
        return nominalized_words

    def get_pos(self, nlp_doc):
        parts_of_speech = {}
        parts_of_speech['gerunds'] = []
        parts_of_speech['adjectives'] = []
        parts_of_speech['adverbs'] = []
        parts_of_speech['prepositions'] = []
        for token in nlp_doc:
            if token.tag_ == "VBG":
                parts_of_speech['gerunds'].append(token.text)
            elif token.pos_ == "ADJ":
                parts_of_speech['adjectives'].append(token.text)
            elif token.pos_ == "ADV":
                parts_of_speech['adverbs'].append(token.text)
            else:
                pass
        return parts_of_speech

    def get_pps(self, doc):
        # Collect prepositional phrases from a parsed document.
        pps = []
        for token in doc:
            if token.pos_ == 'ADP':
                pp = ' '.join([tok.orth_ for tok in token.subtree])
                pps.append(pp)
        return pps

    def get_passive_phrases(self, doc):
        self.passive_sents = []
        passive_phrases = []
        passive_rule = [{'DEP': 'nsubjpass'},
                        {'DEP': 'aux', 'OP': '*'},
                        {'DEP': 'auxpass'},
                        {'TAG': 'VBN'}]
        self.matcher.add('passive', None, passive_rule)
        sents = list(doc.sents)
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            string_id = doc.vocab.strings[match_id]
            span = doc[start:end]
            passive_phrases.append(span.text)
        for s in self.sents:
            for p in passive_phrases:
                if p in s.text:
                    self.passive_sents.append(s.text)
        # return passive_phrases

    def get_weak_verbs(self, doc):
        self.weak_verbs = {}
        self.weak_verbs['to be'] = []
        self.weak_verbs['auxiliary'] = []
        for token in doc:
            if token.lemma_ == "be":
                self.weak_verbs['to be'].append(token.text)
            elif token.pos_ == 'AUX':
                self.weak_verbs['auxiliary'].append(token.text)
            else:
                pass

    def get_pronouns(self, doc):
        self.personal_pronouns = {}
        self.personal_pronouns['first person pronouns'] = []
        self.personal_pronouns['second person pronouns'] = []
        self.pronouns = []
        for token in doc:
            if token.tag_ == 'PRP' or token.tag_ == "PRP$":
                if token.text.lower() in ['i', 'me', 'mine', 'my', 'myself']:
                    self.personal_pronouns['first person pronouns'].append(token.text)
                elif token.text.lower() in ['you', 'your', 'yours', 'yourself']:
                    self.personal_pronouns['second person pronouns'].append(token.text)
                else:
                    pass
            elif token.pos_ == "PRON":
                self.pronouns.append(token.text.lower())
            else:
                pass

    def statistics(self):
        self.statistics = {}
        # Rates per sentence
        self.statistics['per sentence'] = {}
        self.statistics['per sentence'].update(
            {'preposition rate': len(self.prepositional_phrases) / self.sentence_count})
        self.statistics['per sentence'].update(
            {'be rate': len(self.weak_verbs['to be']) / self.sentence_count})
        self.statistics['per sentence'].update(
            {'passive rate': len(self.passive_sents) / self.sentence_count})
        self.statistics['percent of sentences'] = {}
        self.statistics['percent of sentences'].update(
            {'prepositions': self.statistics['per sentence']['preposition rate'] * 100})
        self.statistics['percent of sentences'].update(
            {'to be': self.statistics['per sentence']['be rate'] * 100})
        self.statistics['percent of sentences'].update(
            {'passives': self.statistics['per sentence']['passive rate'] * 100})
        self.statistics['ratios'] = {}
        self.statistics['ratios'].update(
            {'adverbs to adjectives': len(self.pos['adverbs']) / len(self.pos['adjectives'])})

    def get_freq_dist(self):
        words = [token.text for token in self.doc
                 if token.is_stop != True and token.is_punct != True
                 and token.text.isalpha() == True]
        nouns = [token.text for token in self.doc
                 if token.is_stop != True and token.is_punct != True
                 and token.pos_ == "NOUN" and token.text.isalpha() == True]
        verbs = [token.text for token in self.doc
                 if token.is_stop != True and token.is_punct != True
                 and token.pos_ == "VERB" and token.text.isalpha() == True]
        word_freq = Counter(words)
        noun_freq = Counter(nouns)
        verb_freq = Counter(verbs)
        self.common_words = word_freq.most_common(10)
        self.common_nouns = noun_freq.most_common(10)
        self.common_verbs = verb_freq.most_common(10)

    def get_intities(self):
        self.entities = {}
        for ent in self.doc.ents:
            self.entities[ent.text] = ent.label_
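# Hedged usage sketch (not part of the original class): build a report for a
# short passage and read a few derived attributes. This assumes the module-level
# imports the class depends on (spacy, Matcher, TextBlob, syllables, Counter,
# punctuation) and a spaCy 2.x setup in which Readability() can be added as a pipe.
if __name__ == "__main__":
    sample = ("The quick brown fox jumps over the lazy dog. "
              "The ball was thrown quickly by the tall player.")
    report = NLP(sample)
    print(report.readability["flesch_kincaid_grade"])
    print(report.statistics["per sentence"])
    print(report.passive_sents)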
def transform(self, data):
    data = data[:]
    data = pd.DataFrame({'essay_id': data.index, 'essay': data.values})

    # Correct lexical and grammatical errors
    tool = language_check.LanguageTool('en-US')
    data['matches'] = data['essay'].apply(lambda v: tool.check(v))
    data['corrections_num'] = data.apply(lambda l: len(l['matches']), axis=1)
    data['corrected'] = data.apply(
        lambda l: language_check.correct(l['essay'], l['matches']), axis=1)

    # Tokenize, then run POS tagging and named entity recognition
    tokens, sents, lemma, pos, ner, stop_words = [], [], [], [], [], STOP_WORDS
    flesch_kincaid_grade_level, flesch_kincaid_reading_ease, \
        dale_chall, smog, coleman_liau_index, automated_readability_index, \
        forcast = [], [], [], [], [], [], []
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Readability())
    for essay in nlp.pipe(data['corrected'], batch_size=2, n_threads=2):
        if essay.is_parsed:
            tokens.append([e.text for e in essay])
            sents.append([sent.string.strip() for sent in essay.sents])
            pos.append([e.pos_ for e in essay])
            ner.append([e.text for e in essay.ents])
            lemma.append([n.lemma_ for n in essay])
            flesch_kincaid_grade_level.append(essay._.flesch_kincaid_grade_level)
            flesch_kincaid_reading_ease.append(essay._.flesch_kincaid_reading_ease)
            dale_chall.append(essay._.dale_chall)
            smog.append(essay._.smog)
            coleman_liau_index.append(essay._.coleman_liau_index)
            automated_readability_index.append(essay._.automated_readability_index)
            forcast.append(essay._.forcast)
        else:
            tokens.append(None)
            sents.append(None)
            pos.append(None)
            ner.append(None)
            lemma.append(None)
            flesch_kincaid_grade_level.append(None)
            flesch_kincaid_reading_ease.append(None)
            dale_chall.append(None)
            smog.append(None)
            coleman_liau_index.append(None)
            automated_readability_index.append(None)
            forcast.append(None)

    # POS tags, named entities, lemmas, tokens and sentence splits
    data['tokens'], data['sents'], data['lemma'], data['pos'], data['ner'] = \
        tokens, sents, lemma, pos, ner

    # Readability features
    data['flesch_kincaid_grade_level'], data['flesch_kincaid_reading_ease'], \
        data['dale_chall'], data['smog'], data['coleman_liau_index'], \
        data['automated_readability_index'], data['forcast'] = \
        flesch_kincaid_grade_level, flesch_kincaid_reading_ease, dale_chall, \
        smog, coleman_liau_index, automated_readability_index, forcast

    # Extract further surface features
    data['token_count'] = data.apply(lambda x: len(x['tokens']), axis=1)
    data['unique_token_count'] = data.apply(lambda x: len(set(x['tokens'])), axis=1)
    data['type_token_ratio'] = data.apply(
        lambda x: x['unique_token_count'] / x['token_count'], axis=1)
    data['sent_count'] = data.apply(lambda x: len(x['sents']), axis=1)
    data['ner_count'] = data.apply(lambda x: len(x['ner']), axis=1)
    data['comma'] = data.apply(lambda x: x['corrected'].count(','), axis=1)
    data['quotation'] = data.apply(
        lambda x: x['corrected'].count('\'') + x['corrected'].count('\"'), axis=1)
    data['exclamation'] = data.apply(lambda x: x['corrected'].count('!'), axis=1)
    data['organization'] = data.apply(lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
    data['caps'] = data.apply(lambda x: x['corrected'].count(r'@CAPS'), axis=1)
    data['person'] = data.apply(lambda x: x['corrected'].count(r'@PERSON'), axis=1)
    data['location'] = data.apply(lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
    data['money'] = data.apply(lambda x: x['corrected'].count(r'@MONEY'), axis=1)
    data['time'] = data.apply(lambda x: x['corrected'].count(r'@TIME'), axis=1)
    data['date'] = data.apply(lambda x: x['corrected'].count(r'@DATE'), axis=1)
    data['percent'] = data.apply(lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
    data['at_num'] = data.apply(lambda x: x['corrected'].count(r'@NUM'), axis=1)
    data['noun'] = data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
    data['adj'] = data.apply(lambda x: x['pos'].count('ADJ'), axis=1)
    data['pron'] = data.apply(lambda x: x['pos'].count('PRON'), axis=1)
    data['verb'] = data.apply(lambda x: x['pos'].count('VERB'), axis=1)
    data['cconj'] = data.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
    data['sconj'] = data.apply(lambda x: x['pos'].count('SCONJ'), axis=1)
    data['adv'] = data.apply(lambda x: x['pos'].count('ADV'), axis=1)
    data['det'] = data.apply(lambda x: x['pos'].count('DET'), axis=1)
    data['propn'] = data.apply(lambda x: x['pos'].count('PROPN'), axis=1)
    data['num'] = data.apply(lambda x: x['pos'].count('NUM'), axis=1)
    data['part'] = data.apply(lambda x: x['pos'].count('PART'), axis=1)
    data['intj'] = data.apply(lambda x: x['pos'].count('INTJ'), axis=1)
    data['aux'] = data.apply(lambda x: x['pos'].count('AUX'), axis=1)
    data['adp'] = data.apply(lambda x: x['pos'].count('ADP'), axis=1)
    data['punct'] = data.apply(lambda x: x['pos'].count('PUNCT'), axis=1)
    data['formal'] = data.apply(style_features, axis=1)
    connective_words = self._read_connective_words()
    data['cohesion'] = data.apply(
        lambda x: sum([1 if t in connective_words else 0 for t in x['tokens']]),
        axis=1)
    return data
def read():
    np.random.seed(123)
    pipeline = spacy.load("en")
    return Readability(nlp=pipeline)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            merge = True
            if ent.label_ in skip:
                merge = False
            if ent.label_ == 'DATE' and re.match(months, ent.text.lower()):
                merge = True
            if merge == True:
                attrs = {"tag": ent.root.tag, "dep": ent.root.dep,
                         "ent_type": ent.label}
                retokenizer.merge(ent, attrs=attrs)
    return doc


nlp.add_pipe(skip_ents, after='ner')

# Test for the spacy-readability module
if collect_readability_scores == True:
    nlp.add_pipe(Readability())

# Load the sources file
with open('sources_csv', 'r') as f:
    sources = [dict(line) for line in csv.DictReader(f)]


# The Document class
class Document():
    """Model a document's features.

    Parameters:
    - manifest_dir: the path to the manifest directory
    - manifest_file: the name of the manifest file.
    - content_property: the name of the property from which to extract the content

    Returns a dataframe.
def nlp():
    np.random.seed(123)
    pipeline = spacy.load("en")
    pipeline.add_pipe(Readability(nlp=pipeline))
    return pipeline
def nlp():
    pipeline = spacy.load("en")
    pipeline.add_pipe(Readability())
    return pipeline
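# Hedged example test (not from the original suite), assuming the `nlp` function
# above is registered as a pytest fixture: any Doc produced by the pipeline
# should carry the score extensions that spacy_readability registers.
def test_doc_gets_readability_scores(nlp):
    doc = nlp("I like cats. Cats sit quietly on warm mats.")
    assert doc._.flesch_kincaid_grade_level is not None
    assert doc._.flesch_kincaid_reading_ease is not None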
def read():
    return Readability()
def ProcessText(model: str, text: str):
    nlp = spacy.load(model)
    nlp.max_length = 3000000
    nlp.add_pipe(Readability(), last=True)
    doc = nlp(text)
    return doc2json(doc, model)
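# Hedged usage sketch (doc2json is defined elsewhere in the original module, so
# this only runs in that context); the model name and sample text are illustrative.
if __name__ == "__main__":
    result = ProcessText("en_core_web_sm", "The cat sat on the mat. It was a warm day.")
    print(result)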
def read_ger():
    pipeline = spacy.load("de_core_news_sm")
    np.random.seed(123)
    return Readability(nlp=pipeline)