def simplify_compound_subjects(sentence_str):
    """Return a textacy Doc of the sentence with compound subjects reduced.

    A compound joined by 'and' is replaced by the lowercase pronoun 'they':
    'The man, the boy, and the girl went to school.' becomes
    'they went to school.'

    A compound joined by any other coordinating conjunction is replaced by
    its final member (with its determiner, if any):
    'The man, the boy, or the girls are frauds.' becomes
    'the girls are frauds.'

    Whitespace runs are collapsed to a single space and the result is
    stripped, even when no compound subject is found.

    :sentence_str: String
    :returns: textacy.Doc built from the simplified sentence
    """
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    cs_patterns = [
        r'((<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>)+'
        r'<DET>?(<NOUN|PROPN>|<PRON>)+<PUNCT>?<CCONJ>'
        r'<DET>?(<NOUN|PROPN>|<PRON>)+)|'
        r'(<DET>?(<NOUN|PROPN>|<PRON>)<CCONJ><DET>?(<NOUN|PROPN>|<PRON>))',
    ]
    for cs_pattern in cs_patterns:
        compound_subjects = textacy.extract.pos_regex_matches(
            sentence_doc, cs_pattern)
        # (start_char, end_char) -> replacement text.  When one match holds
        # several conjunctions the last one decides, as before.
        replacements = {}
        for cs in compound_subjects:
            span = (cs[0].idx, cs[-1].idx + len(cs[-1].text))
            for w in cs:
                if w.pos_ != 'CCONJ':
                    continue
                if w.text.lower() == 'and':
                    replacements[span] = 'they'
                else:
                    # keep the final <DET>?(<NOUN|PROPN>|<PRON>)
                    repl = cs[-1:].text
                    if cs[-2].pos_ == 'DET':
                        repl = cs[-2:].text
                    replacements[span] = repl

        # Splice the replacements in directly instead of padding with a
        # sentinel character, which would also delete that character if it
        # occurred in the input.
        original = sentence_doc.text
        pieces = []
        cursor = 0
        for (start, end), repl in sorted(replacements.items()):
            if start < cursor:
                continue  # defensive: ignore a span overlapping the previous
            pieces.append(original[cursor:start])
            pieces.append(repl)
            cursor = end
        pieces.append(original[cursor:])
        new_sent_str = ''.join(pieces)

        new_sent_str = re.sub(r'\s+', ' ', new_sent_str).strip()
        sentence_doc = textacy.Doc(new_sent_str, lang='en_core_web_lg')
    return sentence_doc
def headline_to_svo(text):
    """Return the first subject-verb-object triple found in a headline.

    Non-printable characters are dropped.  The text is parsed once in lower
    case; every main verb found there is written back, lower-cased, into the
    original-case text, which is then parsed again for SVO triples.

    :text: String
    :returns: the first triple yielded by
        textacy.extract.subject_verb_object_triples, or None if there is none
    """
    text_str = ''.join(ch for ch in text if ch in string.printable)
    lowered_doc = textacy.Doc(text_str.lower(), lang=u"en")
    for verb in textacy.extract.get_main_verbs_of_sent(lowered_doc):
        verb_str = str(verb)
        # Use the token's own character offset.  Searching for the verb text
        # would hit the first substring occurrence (e.g. "is" inside "this")
        # and rewrite the wrong part of the headline.  Lower-casing printable
        # ASCII never changes length, so the offset is valid in text_str.
        start = verb.idx
        text_str = (text_str[:start] + verb_str +
                    text_str[start + len(verb_str):])
    doc = textacy.Doc(text_str, lang=u"en")
    return next(textacy.extract.subject_verb_object_triples(doc), None)
def kwd(sentence):
    """Extract keywords from a sentence using SGRank plus TextRank.

    Up to three SGRank key terms (1- and 2-grams) come first, followed by
    every TextRank term that is not a substring of the comma-joined SGRank
    terms.  Terms containing a digit, or of three characters or fewer, are
    discarded.

    :sentence: String
    :returns: list of keyword strings; an empty list if extraction fails
    """
    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        sgrank_terms = [
            item[0] for item in textacy.keyterms.sgrank(
                txt_doc, ngrams=(1, 2), n_keyterms=3)
        ]
        sgrank_joined = ', '.join(sgrank_terms)
        textrank_terms = [
            item[0] for item in textacy.keyterms.textrank(txt_doc)
        ]
        # Substring test against the joined string (not list membership):
        # "data" is dropped when SGRank already produced "big data".
        extra_terms = [t for t in textrank_terms if t not in sgrank_joined]
        candidates = sgrank_terms + extra_terms
        # remove numbers from keywords
        candidates = [
            t for t in candidates
            if not any(str(ch).isdigit() for ch in t)
        ]
        # removing very small keywords (to reduce noise)
        return [t for t in candidates if len(t) > 3]
    except Exception:
        # Best effort: any extraction failure yields no keywords.  Narrowed
        # from a bare except so KeyboardInterrupt/SystemExit still propagate.
        return []
def build_comp_termlist():
    """Build one term list per competence directory under KAG_BASE_PATH.

    Walks KAG_BASE_PATH/<kag>/<competence>/*.txt, cleans every text file
    with preprocess_text, joins the cleaned texts of a competence with
    spaces and turns the result into a textacy Doc.

    :returns: list with one ``doc.to_terms_list(named_entities=False,
        as_strings=True)`` result per competence directory
    """
    import io

    spacy_lang = en_core_web_sm.load()
    texts = []
    for kag_path in glob.glob(KAG_BASE_PATH + '/*'):
        for comp_path in glob.glob(kag_path + '/*'):
            comp_text = ''
            for filename in glob.glob(comp_path + '/*.txt'):
                # io.open decodes UTF-8 on both Python 2 and 3, and the
                # context manager closes the handle that was leaked before.
                with io.open(filename, 'r', encoding='utf-8') as handle:
                    raw_text = handle.read()
                clean_text = preprocess_text(raw_text,
                                             no_urls=True,
                                             no_emails=True,
                                             no_phone_numbers=True,
                                             no_numbers=True,
                                             no_currency_symbols=True,
                                             no_punct=True,
                                             no_contractions=True,
                                             no_accents=True)
                comp_text = ' '.join((comp_text, clean_text))
            doc = textacy.Doc(comp_text, lang=spacy_lang)
            texts.append(
                doc.to_terms_list(named_entities=False, as_strings=True))
    return texts
def extractNamedEntitiesAndCreateTextList(self, wordsWithoutStopWordsList):
    """
    Create a word list of the text in which every named entity is kept
    together as a single item, such as [..., 'Angela Merkel', 'said', ...].

    :wordsWithoutStopWordsList: List of word strings
    :returns: List of strings (words and whole named entities)
    """
    import re

    # transform text list to string, since removeStopWordsFromText() returns a list
    text = " ".join(wordsWithoutStopWordsList)

    # TODO: handle all used languages (en, es, de)
    doc = textacy.Doc(text, lang='en')
    named_entities = [
        str(ent)
        for ent in textacy.extract.named_entities(doc, exclude_types='numeric')
    ]

    # Replace only the FIRST occurrence of each entity with its own 'tmpN'
    # placeholder, so that every placeholder maps back to one entity.
    placeholders = {}
    for i, entity in enumerate(named_entities):
        if entity not in text:
            # Already consumed by an earlier (overlapping) entity; there is
            # nothing to restore.  Previously this case raised ValueError.
            continue
        placeholder = 'tmp{}'.format(i)
        text = text.replace(entity, placeholder, 1)
        placeholders[placeholder] = entity

    def restore(match):
        # Unknown 'tmpN' strings are left untouched.
        return placeholders.get(match.group(0), match.group(0))

    # Split into words, then swap the placeholders back.  A regex is used so
    # a placeholder fused to neighbouring characters (e.g. 'tmp0,') is
    # restored as well, where list.index() used to fail.
    placeholder_re = re.compile(r'tmp\d+')
    return [placeholder_re.sub(restore, word) for word in text.split()]
def similaritytoselection(selection,corpus):
    """Rank the documents of a corpus by similarity to a selected text.

    selection is any text string, e.g. a model answer or one selected
    student response.  Each item of corpus is scored against it with
    textacy.similarity.word2vec.

    Returns a list of (similarity, document_text) tuples ordered from most
    to least similar.
    """
    reference = textacy.Doc(selection)
    scored = []
    for candidate in corpus:
        score = textacy.similarity.word2vec(reference, candidate)
        scored.append((score, candidate.text))
    # Negated key gives descending order and keeps ties in corpus order.
    return sorted(scored, key=lambda pair: -pair[0])
def hola(request):
    """Render 'hola.html'; on POST, print entity statistics for the text.

    On a POST request the 'text_box' field is parsed with the Spanish spaCy
    model, and the sentence count and the distinct entities per label are
    printed.  The template always receives an empty 'datos' list.
    """
    datos = []
    if request.method == "POST":
        # A missing field is treated as empty text, not a server error.
        text_box_value = request.POST.get('text_box', '')
        print(text_box_value)

        # Load the model only when there is text to analyse; it was
        # previously loaded for every request, including plain GETs.
        nlp = spacy.load("es_core_news_md")
        contenido = nlp(text_box_value)
        docs = textacy.Doc(contenido)
        sentencias = list(docs.sents)
        print(len(sentencias))

        labels = {ent.label_ for ent in contenido.ents}
        # NOTE(review): personas is collected but never passed on to the
        # template -- confirm whether it should end up in datos.
        personas = ""
        for label in labels:
            # distinct, whitespace-stripped entity strings for this label
            entities = list({
                e.string.strip() for e in contenido.ents if e.label_ == label
            })
            if label == "PER":
                personas = entities
            print(label, entities)
        print("Esta son las entidades personas")
    return render(request, 'hola.html', {'datos': datos})
def find_phrases(self, sentence, stop_tokens):
    """Collect noun-chunk n-grams, unigrams and verb lemmas of a sentence.

    Returns a tuple (doc_grams, unigrams, verbs_list):
      * doc_grams  - space-joined 3-grams and 2-grams of each noun chunk
      * unigrams   - noun-chunk tokens accepted by self.is_valid_word
      * verbs_list - lemmas of verb-phrase matches that are not stop tokens
                     and are accepted by self.is_valid_word
    """
    doc_grams = []
    unigrams = []
    for chunk in nlp(sentence).noun_chunks:
        # keep the surface form of pronouns instead of the "-PRON-" lemma
        lemmas = [
            tok.text if tok.lemma_ == "-PRON-" else tok.lemma_
            for tok in chunk
        ]
        tokens = [
            piece for piece in " ".join(lemmas).split()
            if piece != "" and piece not in stop_tokens
        ]
        unigrams.extend(word for word in tokens if self.is_valid_word(word))

        grams = self.generate_ngrams(tokens, 3)
        grams.extend(self.generate_ngrams(tokens, 2))
        doc_grams.extend(
            space_join(gram) for gram in grams if gram not in stop_tokens)

    verb_doc = textacy.Doc(sentence, lang=model)
    verb_matches = textacy.extract.pos_regex_matches(
        verb_doc, r'<VERB>?<ADV>*<VERB>+')
    verbs_list = []
    for match in verb_matches:
        verbs_list.extend(
            verb for verb in match.lemma_.split()
            if verb not in stop_tokens and self.is_valid_word(verb))
    return doc_grams, unigrams, verbs_list
def keywords():
    """Return weighted key terms of the posted document as one string.

    Reads 'content' and 'title' from request.get_json(), scores key terms
    with SGRank (weight 0.9), TextRank (0.05) and SingleRank (0.05), sums
    the weighted scores per term and sorts the terms by descending score.

    The result is the top half and the bottom half of the ranking, each
    joined with ",,", separated by "||".  With no terms at all the empty
    string is returned.
    """
    payload = request.get_json()
    doc = textacy.Doc(payload['content'],
                      metadata={'title': payload['title']},
                      lang=unicode('en_core_web_sm'))

    sgrank_scores = dict(keyterms.sgrank(doc))
    singlerank_scores = dict(keyterms.singlerank(doc))
    textrank_scores = dict(keyterms.textrank(doc))

    weighted = ((sgrank_scores, 0.9),
                (textrank_scores, 0.05),
                (singlerank_scores, 0.05))
    for scores, weight in weighted:
        for term in scores:
            scores[term] = scores[term] * weight

    combined = dict(
        Counter(sgrank_scores) + Counter(textrank_scores) +
        Counter(singlerank_scores))
    ranked = sorted(combined.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    if not ranked:
        return ""

    terms = [entry[0] for entry in ranked]
    half = len(terms) // 2
    return ",,".join(terms[:half]) + "||" + ",,".join(terms[half:])
def __init__(self, infile):
    """Extract the contents of infile and store them as a textacy Doc.

    The document is kept in self.doc with the metadata
    {'filename': infile}.
    """
    extractor = ExtractFileContents()
    print("INIT: Extract contents for infile: ", infile)
    self.doc = textacy.Doc(extractor.extractContents(infile),
                           metadata={'filename': infile})
def label_kwd(sentence):
    """Build a 'sentence<TAB>keywords' line for a sentence and print it.

    Keywords are the SGRank key terms (1- and 2-grams) followed by all
    TextRank key terms, minus terms containing a digit and terms of three
    characters or fewer, joined with commas.

    :sentence: String
    :returns: 'sentence\\tkw1,kw2,...\\n', or '' if extraction fails
    """
    try:
        txt_doc = textacy.Doc(sentence, lang="en_core_web_sm")
        sgrank_terms = [
            item[0]
            for item in textacy.keyterms.sgrank(txt_doc, ngrams=(1, 2))
        ]
        textrank_terms = [
            item[0] for item in textacy.keyterms.textrank(txt_doc)
        ]
        # NOTE(review): all TextRank terms are appended, so a term found by
        # both algorithms appears twice.  The removed dead code computed a
        # de-duplicated list but never used it -- confirm which is intended.
        candidates = sgrank_terms + textrank_terms
        # remove numbers from keywords
        candidates = [
            t for t in candidates
            if not any(str(ch).isdigit() for ch in t)
        ]
        # removing very small keywords (to reduce noise)
        candidates = [t for t in candidates if len(t) > 3]
        final_data = sentence + '\t' + ','.join(candidates) + '\n'
    except Exception:
        # Best effort: any failure yields an empty line.  Narrowed from a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        final_data = ''
    print(final_data)
    print()
    return final_data
def remove_adverbial_clauses(sentence_str):
    """Given a string, drop any adverbial clauses.

    A comma-to-comma stretch is removed (both commas included) when a token
    with the 'advcl' dependency occurs between the two commas.

    :sentence_str: String
    :returns: String without the removed stretches
    """
    # TODO: should also return updated indexes
    # Sam , worried, asked him.
    # [0, 0, 0, 0, 3, 3, 3]
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')

    advcl_spans = []  # => [(start_char, end_char), ...]
    has_advcl = False
    start = None  # most recent comma token, if any
    for w in tdoc:
        if w.tag_ == ',':
            if has_advcl and start is not None:
                # close the phrase opened at the previous comma
                advcl_spans.append((start.idx, w.idx + len(w.text)))
            # every comma (re)starts a candidate phrase
            start = w
            has_advcl = False
        if w.dep_ == 'advcl':
            has_advcl = True

    # Cut the spans out directly instead of masking them with a sentinel
    # character, which would also delete that character from the input.
    # Consecutive spans may share their boundary comma, hence the clamping.
    pieces = []
    cursor = 0
    for start_char, end_char in advcl_spans:
        if end_char <= cursor:
            continue
        pieces.append(sentence_str[cursor:max(start_char, cursor)])
        cursor = end_char
    pieces.append(sentence_str[cursor:])
    return ''.join(pieces)
def phrases(self, clean_text):
    """Return all 2- to 5-word n-grams of the lemmatized text as strings.

    The text is lemmatized with self.stopwords, everything after the first
    '/' of each lemma is cut off, and n-grams are extracted with stop words,
    punctuation and numbers filtered out.  Results are ordered by n-gram
    size (all 2-grams first, then 3-grams, and so on).
    """
    all_lemmas = lemmatize(clean_text, stopwords=self.stopwords)
    curated_text = ' '.join(str(word).split('/')[0] for word in all_lemmas)
    doc = textacy.Doc(curated_text, lang='en')

    found = []
    for size in (2, 3, 4, 5):
        found.extend(
            textacy.extract.ngrams(doc,
                                   size,
                                   filter_stops=True,
                                   filter_punct=True,
                                   filter_nums=True))
    return [str(phrase) for phrase in found]
def substitute_infinitives_as_subjects(sent_str):
    """If an infinitive is used as a subject, substitute the gerund.

    An infinitive is a 'to' particle directly followed by a base-form verb
    (tag 'VB').  It counts as a subject when one of its tokens, or the
    token right after it, has the 'csubj' dependency.

    :sent_str: String
    :returns: String with each such infinitive replaced by the present
        participle of its verb
    """
    sent_doc = textacy.Doc(sent_str, lang='en_core_web_lg')
    inf_pattern = r'<PART><VERB>'  # To aux/auxpass* csubj
    infinitives = textacy.extract.pos_regex_matches(sent_doc, inf_pattern)

    pieces = []
    cursor = 0
    for inf in infinitives:
        if inf[0].text.lower() != 'to':
            continue
        if inf[-1].tag_ != 'VB':
            continue
        # Bounds check: an infinitive ending the sentence has no following
        # token (this used to raise IndexError).
        next_i = inf[-1].i + 1
        followed_by_csubj = (next_i < len(sent_doc) and
                             sent_doc[next_i].dep_ == 'csubj')
        if not (any(w.dep_ == 'csubj' for w in inf) or followed_by_csubj):
            continue

        gerund = conjugate(inf[-1].text, tense='presentparticiple')
        start = inf[0].idx
        end = inf[-1].idx + len(inf[-1].text)
        if not gerund or start < cursor:
            continue  # nothing to substitute; leave the infinitive as is

        # Splice the gerund in directly.  The former sentinel + str.format
        # approach failed on sentences containing '{' or '}' and deleted
        # the sentinel character from the input.
        pieces.append(sent_str[cursor:start])
        pieces.append(gerund)
        cursor = end
    pieces.append(sent_str[cursor:])
    return ''.join(pieces)
def v_phrase_scores(sentence=None, phrase_type='verb'):
    """Sum the sentiment scores of all verb or noun phrases in a sentence.

    phrase_type selects the POS pattern ('verb' or 'noun').  Every matching
    phrase is scored with v_sentiment_scores and the 'pos', 'neg', 'neu'
    and 'compound' values are added up.

    Returns the dict of totals, or None (after printing a message) for an
    unknown phrase_type.
    """
    known_patterns = (
        ('verb', r'<VERB>?<ADV>*<VERB>+'),
        ('noun', r'<DET>? (<NOUN>+ <ADP|CONJ>)* <NOUN>+'),
    )
    for kind, candidate in known_patterns:
        if phrase_type == kind:
            pattern = candidate
            break
    else:
        print('Unregognized phrase type')
        return

    doc = textacy.Doc(sentence, lang='en_core_web_sm')
    totals = dict.fromkeys(('pos', 'neg', 'neu', 'compound'), 0)
    for phrase in textacy.extract.pos_regex_matches(doc, pattern):
        phrase_score = v_sentiment_scores(phrase.text)
        for key in totals:
            totals[key] += phrase_score[key]
    return totals
def word_stats(tree):
    """Return textacy TextStats for the text of the tree's "mainText" node.

    Any exception raised on the way is printed and None is returned.
    """
    try:
        main_text = tree.find("mainText").text
        return textacy.text_stats.TextStats(textacy.Doc(main_text))
    except Exception as err:
        print(err)
        return None
def setUp(self):
    """Prepare a Doc and a Corpus from CapitolWords speeches.

    self.doc holds the first Bernie Sanders text; self.corpus is built from
    up to ten of his records, with the 'text' field split off as content
    and the rest as metadata.
    """
    speaker = 'Bernie Sanders'
    self.spacy_lang = textacy.data.load_spacy('en')
    self.cw = textacy.datasets.CapitolWords()

    first_texts = list(self.cw.texts(speaker_name={speaker}, limit=1))
    self.text = first_texts[0]
    self.doc = textacy.Doc(self.text.strip(), lang=self.spacy_lang)

    speaker_records = self.cw.records(speaker_name={speaker}, limit=10)
    texts, metadatas = textacy.fileio.split_record_fields(
        speaker_records, 'text')
    self.corpus = textacy.Corpus(self.spacy_lang,
                                 texts=texts,
                                 metadatas=metadatas)
def raise_infinitive_error(sentence_str):
    """Given a string, check that all infinitives are properly formatted.

    Raises Exception('InfinitivePhraseError') for the first 'to' + verb
    pair whose verb is not in base form (tag 'VB').  Returns None otherwise.
    """
    sent_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    candidates = textacy.extract.pos_regex_matches(
        sent_doc, r'<PART|ADP><VERB>')  # To aux/auxpass* csubj
    for candidate in candidates:
        starts_with_to = candidate[0].text.lower() == 'to'
        if starts_with_to and candidate[-1].tag_ != 'VB':
            raise Exception('InfinitivePhraseError')
def get_new_doc(phrase, doc_type='textacy'):
    """Create a document object of the requested type for a phrase.

    doc_type 'textacy' gives textacy.Doc(phrase, lang=lang_en), 'spacy'
    gives nlp(phrase).  Any other value listed in possible_docs yields None.
    The arguments are checked with assertions.
    """
    assert isinstance(phrase, basestring)
    assert isinstance(doc_type, str)
    assert doc_type in possible_docs, "Only {} doc types are supported".format(
        possible_docs)

    if doc_type == 'spacy':
        return nlp(phrase)
    if doc_type == 'textacy':
        return textacy.Doc(phrase, lang=lang_en)
    return None
def wordTuples(graph, textEntry):
    """Return the TextRank key terms of a text as one-item dicts.

    The text comes from rootify(graph, textEntry) and is parsed with the
    'pt' spaCy pipeline.  As many key terms are requested as the text has
    unique words.  Each result item is {term: score}.
    """
    text = rootify(graph, textEntry)
    pt = textacy.load_spacy('pt')
    doc = textacy.Doc(text, lang=pt)
    stats = textacy.TextStats(doc)
    ranked = textacy.keyterms.textrank(doc,
                                       normalize='lower',
                                       n_keyterms=stats.n_unique_words)
    words = []
    for entry in ranked:
        words.append({entry[0]: entry[1]})
    return words
def extractNamedEntities(query: str, language: str) -> list:
    """
    Extract the named entities from a given query sentence string

    :query: String
    :language: String, passed to textacy.Doc as lang
    :returns: List of entity strings
    """
    doc = textacy.Doc(query, lang=language)
    return [str(entity) for entity in textacy.extract.named_entities(doc)]
def split_infinitive_warning(sentence_str):
    """Return a warning for a split infinitive, else, None.

    A split infinitive is 'to', one adverb, then a base-form verb
    (tag 'VB').
    """
    sent_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    candidates = textacy.extract.pos_regex_matches(
        sent_doc, r'<PART><ADV><VERB>')  # To aux/auxpass* csubj
    for candidate in candidates:
        if (candidate[0].text.lower() == 'to' and
                candidate[-1].tag_ == 'VB'):
            return 'SplitInfinitiveWarning'
    return None
def get_verb_chunks(sent):
    """Return (and print) the text of every verb phrase in a sentence.

    Verb phrases are the matches of the POS pattern
    '<VERB>?<ADV>*<VERB>+'.
    """
    doct = textacy.Doc(sent, lang='en_core_web_sm')
    matches = textacy.extract.pos_regex_matches(doct,
                                                r'<VERB>?<ADV>*<VERB>+')
    verb_chunks = [match.text for match in matches]
    print(verb_chunks)
    return verb_chunks
def find_terms(content: str,
               maxterms=MAX_SUGGESTED_KEYTERMS,
               ngrams_to_extract=(1, 2, 3)) -> dict:
    '''
    Find NGRAMS of significance in the passed text; looks for ngrams of up
    to 3 words by default.

    content may be a string or an open text stream (which is read fully).

    Returns a dict with two entries:
      'ngrams'   - list of (term, frequency) tuples sorted by descending
                   frequency, without stop words and empty strings
      'keyterms' - the TextRank key terms (at most maxterms)
    '''
    import textacy
    import textacy.keyterms
    import io
    from spacy.lang.en.stop_words import STOP_WORDS

    # if we were handed a text stream (e.g. standard in), read it;
    # otherwise it is assumed to be a str already
    if isinstance(content, io.TextIOBase):
        text = content.read()
    else:
        text = content

    LOG.debug('content has %d chars', len(text))

    doc = textacy.Doc(text)
    LOG.debug(doc)

    # extract keyterms to make suggestions for new ADASS subject terms
    keyterms = textacy.keyterms.textrank(doc,
                                         normalize='lemma',
                                         n_keyterms=maxterms)
    LOG.debug('KEYTERMS: %s', keyterms)

    # We'll use the Bag of terms, ngrams by frequency, to find relevant
    # matches with existing terms in the ADASS dictionary
    bot = doc.to_bag_of_terms(ngrams=ngrams_to_extract,
                              lemmatize=True,
                              named_entities=True,
                              weighting='count',
                              as_strings=True)

    # For some reason we see stopwords in the BoT, so make another pass to
    # clean out stopwords and the empty string
    cleaned_bot = [(term, cnt) for term, cnt in bot.items()
                   if term not in STOP_WORDS and term != '']
    sorted_cleaned_bot = sorted(cleaned_bot, key=lambda x: x[1], reverse=True)

    LOG.debug('BAG of Terms (top, cleaned): %s',
              sorted_cleaned_bot[:MAX_BOT_TERMS])

    return {'ngrams': sorted_cleaned_bot, 'keyterms': keyterms}
def extract_keyphrases(self, algorithm, **kwargs):
    """
    Extract keyphrases from self.text and store them in self.keyphrases.

    algorithm: either a callable taking (doc, **kwargs), or the name of a
        function in the ``tkt`` module as a 'str' (dotted attribute paths
        are accepted)
    **kwargs: parameters for algorithm
    """
    if isinstance(algorithm, str):
        import operator

        # SECURITY: this used to be eval('tkt.<name>'), which executes any
        # expression hidden in the string.  A plain attribute lookup
        # resolves the same function names without running code.
        algorithm = operator.attrgetter(algorithm)(tkt)
    doc = textacy.Doc(self.text, lang='en')
    self.keyphrases = list(algorithm(doc, **kwargs))
def drop_modifiers(sentence_str):
    """Given a string, drop the modifiers and return a string without them.

    A modifier is any token whose dependency label ends in 'mod'.
    Whitespace is normalized afterwards with
    textacy.preprocess.normalize_whitespace.
    """
    tdoc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    text = tdoc.text

    # Copy everything except the modifier tokens.  The former approach of
    # masking with a sentinel character also deleted that character
    # whenever it occurred in the input.
    pieces = []
    cursor = 0
    for token in tdoc:
        if token.dep_.endswith('mod'):
            pieces.append(text[cursor:token.idx])
            cursor = token.idx + len(token.text)
    pieces.append(text[cursor:])

    new_sent = ''.join(pieces)
    return textacy.preprocess.normalize_whitespace(new_sent)
def remove_prepositional_phrases(sentence_str):
    """Return the sentence with its prepositional phrases removed.

    A prepositional phrase is a match of the POS pattern
    '<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'.  Surrounding whitespace is
    left as it is.
    """
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    # possessive pronouns are labeled as ADJ
    pp_pattern = r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'
    prep_phrases = textacy.extract.pos_regex_matches(sentence_doc, pp_pattern)

    # Copy everything except the matched phrases.  Masking them with a
    # sentinel character, as before, also deleted that character whenever
    # it occurred in the input.
    kept = []
    cursor = 0
    for pp in prep_phrases:
        start = pp[0].idx
        end = start + len(pp.text)
        if end <= cursor:
            continue
        kept.append(sentence_str[cursor:max(start, cursor)])
        cursor = end
    kept.append(sentence_str[cursor:])
    return ''.join(kept)
def show_plot(self, pos, neg, plot_type='semantic'):
    """Plot topics for the positive and for the negative tweet group.

    plot_type 'semantic' draws a PageRank-weighted semantic network of
    word co-occurrences; any other value fits a 3-topic NMF model on the
    TF-IDF matrix and draws a termite plot.  Each figure is shown without
    blocking.

    pos, neg: iterables of tweets accepted by self.prep_tokens; the
        semantic plot additionally reads ``.shape[0]`` for the title.
    """
    for name, group in (('Positive', pos), ('Negative', neg)):
        corpus = [self.prep_tokens(tweet) for tweet in group]
        if plot_type == 'semantic':
            cleaned_text = textacy.preprocess_text(' '.join(corpus),
                                                   fix_unicode=True,
                                                   no_accents=True)
            doc = textacy.Doc(cleaned_text, lang='en')
            graph = doc.to_semantic_network(nodes='words',
                                            edge_weighting='cooc_freq',
                                            window_width=10)
            for node in ['pron']:
                try:
                    graph.remove_node(node)
                except nx.NetworkXError:
                    # node not in the graph; nothing to drop (this used to
                    # be a bare except that hid every other error too)
                    pass
            node_weights = nx.pagerank_scipy(graph)
            textacy.viz.network.draw_semantic_network(
                graph, node_weights=node_weights, spread=50.0)
            plt.suptitle(name + ' Sentiment Topics:' +
                         '\n{} {} tweets\n{}'.format(
                             group.shape[0], name, self.hashtag))
        else:
            tf = TfidfVectorizer().fit(corpus)
            doc_term_matrix = tf.transform(corpus)
            # NOTE(review): dropping 'pron' from the vocabulary but not
            # from the matrix shifts every later term label by one column
            # -- confirm whether the column should be removed as well.
            vocab = [word for word in tf.get_feature_names()
                     if word != 'pron']
            model = textacy.tm.TopicModel('nmf', n_topics=3)
            model.fit(doc_term_matrix)
            model.termite_plot(doc_term_matrix,
                               vocab,
                               topics=-1,
                               n_terms=25,
                               sort_terms_by='seriation',
                               rank_terms_by='topic_weight',
                               highlight_topics=range(3))
            plt.suptitle(name + ' Sentiment Topics:')
        plt.show(block=False)
def generate_corpus(comment_json):
    """Load the comments of a JSON file as a list of textacy Docs.

    The file is expected to hold a mapping whose first value has a
    'comments' list.  Each comment becomes one Doc with the metadata keys
    'comment_author' and 'comment_time'.
    """
    with open(comment_json) as input_file:
        article_comments = json.load(input_file)
    first_article = list(article_comments.values())[0]

    docs = []
    for comment in first_article['comments']:
        metadata = {
            'comment_author': comment['comment_author'],
            'comment_time': comment['comment_time'],
        }
        docs.append(textacy.Doc(comment['comment'], metadata=metadata))
    return docs
def remove_prepositional_phrases(sentence_str):
    """Given a string, drop the prepositional phrases and return a new
    string without them.

    Prepositional phrases are the matches of the POS pattern
    '<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'.  Whitespace around a removed
    phrase is not touched.
    """
    sentence_doc = textacy.Doc(sentence_str, lang='en_core_web_lg')
    pp_pattern = r'<ADP><ADJ|DET>?(<NOUN>+<ADP>)*<NOUN>+'
    prep_phrases = textacy.extract.pos_regex_matches(sentence_doc, pp_pattern)

    # Build the result from the text between the matches.  This replaces
    # the sentinel-character masking, which stripped the sentinel from the
    # input as a side effect.
    pieces = []
    cursor = 0
    for pp in prep_phrases:
        start = pp[0].idx
        end = start + len(pp.text)
        if end <= cursor:
            continue
        pieces.append(sentence_str[cursor:max(start, cursor)])
        cursor = end
    pieces.append(sentence_str[cursor:])
    return ''.join(pieces)