import re
import string

from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from unidecode import unidecode


def clean_text(document):
    stop_words_ = STOP_WORDS.union(stopwords.words('english'))
    stop_words = [unidecode(stop).lower() for stop in stop_words_]
    # Split to translate
    tokens = document.split()
    # Concatenate
    document = ' '.join(tokens)
    # Remove accents
    document = unidecode(document)
    # Remove URLs, mentions, special characters and single characters
    document = re.sub(
        r"(@[A-Za-z0-9]+)|(_[A-Za-z0-9]+)|(\w+:\/\/\S+)|(\W_)", " ",
        document).lower()
    # Remove punctuation
    document = re.sub('[' + string.punctuation + ']', '', document)
    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Remove digits
    document = ''.join([i for i in document if not i.isdigit()])
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Split
    tokens = document.split()
    # Remove stop words
    tokens = [w for w in tokens if w not in stop_words]
    # Concatenate
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
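# A minimal usage sketch of clean_text (the sample tweet is invented; it relies
# on the imports above and on the NLTK stopword corpus being downloaded):
sample_tweet = "Check this out @user1: https://example.com/covid19 update!!!"
print(clean_text(sample_tweet))  # e.g. -> "check update"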
def _set_stopwords(self) -> 'KeywordRanking':
    stop_words = STOP_WORDS.union(self.stopwords) if self.stopwords else STOP_WORDS
    for word in stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    return self
def cluster_commonwords(texts, nwords=10, onlycorona="yes"):
    ignore_words = STOP_WORDS if onlycorona == "no" else STOP_WORDS.union(
        ['coronavirus', 'covid', 'covid19', 'covid-19'])
    allwords = [
        w for w in ' '.join(texts.str.lower()).split()
        if w not in ignore_words and re.search('[a-z]', w)
    ]
    return ', '.join(
        [word for word, cnt in Counter(allwords).most_common(nwords)])
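# Hypothetical call with a toy pandas Series standing in for the tweet texts
# (assumes the snippet's own imports -- re, Counter, STOP_WORDS -- are in scope):
import pandas as pd

tweets = pd.Series([
    "stay home and stay safe during the coronavirus lockdown",
    "new covid19 cases reported today, please stay safe",
])
print(cluster_commonwords(tweets, nwords=3))                   # covid terms filtered out (default)
print(cluster_commonwords(tweets, nwords=3, onlycorona="no"))  # covid terms kept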
def set_stopwords(self, stopwords):
    """Set stop words"""
    if self.language == "en":
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True
    elif self.language == "de":
        for word in STOP_WORDS_DE.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True
def transform_matrix(content_body, tokenize_lemma):
    html_stop_words = get_html_stop_words(content_body, tokenize_lemma)
    stop_words_lemma_train = set(
        tokenize_lemma(' '.join(STOP_WORDS.union(set(html_stop_words)))))
    X = content_body
    tfidf_vectorizer = TfidfVectorizer(max_features=300,
                                       stop_words=stop_words_lemma_train,
                                       tokenizer=tokenize_lemma)
    tfidf_vectorizer = tfidf_vectorizer.fit(X)
    tfidf_matrix = tfidf_vectorizer.transform(X)
    return tfidf_matrix
def remove_stopwords(content):
    custom_stopwords = ("feeling", "feel", "becaus", "want", "time", "realli",
                        "im", "think", "thing", "ive", "still", "littl", "one",
                        "life", "peopl", "need", "bit", "even", "much", "dont",
                        "look", "way", "love", "start", "s", "m", "quot",
                        "work", "get", "http", "go", "day", "com", "got",
                        "see", "4pm", "<BIAS>", "veri", "know", "t", "like",
                        "someth", "good", "going", "today", "u", "new", "cant",
                        "people", "little", "pretty", "things")
    return hero.remove_stopwords(content,
                                 spacy_stop_words.union(custom_stopwords))
def scripts_to_tfidf(scripts):
    """Create a Tf-idf matrix from tokenized scripts."""
    # custom stop words for scripts
    film_stop_words = ['V.O.', "Scene", "CUT TO", "FADE IN"]
    stop_words = STOP_WORDS.union(film_stop_words)
    # vectorize scripts into a Tf-idf matrix; terms that appear in fewer than
    # 20% of the scripts are dropped (min_df=0.2)
    vectorizer = TfidfVectorizer(input='content',
                                 stop_words=list(stop_words),
                                 min_df=0.2,
                                 ngram_range=(1, 2))
    bow = vectorizer.fit_transform(scripts)
    vocab = vectorizer.get_feature_names()
    return bow, vocab
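# Hypothetical usage with two invented script strings (assumes TfidfVectorizer
# and STOP_WORDS are imported as in the original module; get_feature_names()
# implies scikit-learn < 1.2, newer versions use get_feature_names_out()):
scripts = [
    "FADE IN. INT. KITCHEN - NIGHT. She pours the coffee and waits.",
    "CUT TO: EXT. STREET - DAY. He runs after the bus, shouting.",
]
bow, vocab = scripts_to_tfidf(scripts)
print(bow.shape)   # (n_scripts, n_terms)
print(vocab[:10])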
def stage2(process_folder, label):
    from spacy.lang.en.stop_words import STOP_WORDS

    path_stage1 = process_folder + label + 'stage1.json'
    path_stage2 = process_folder + label + 'stage2.json'
    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1,
                                        ranks,
                                        stopwords=STOP_WORDS.union(STOP_WORDS)):
            f.write("%s\n" % pretty_print(rl._asdict()))
def get_baseline():
    print('Spacy:', len(STOP_WORDS))
    sw_sk = set(stop_words.ENGLISH_STOP_WORDS)
    print('sklearn', len(sw_sk))
    sw = set(
        pd.read_csv(
            'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
            header=None,
            squeeze=True).tolist())
    print('web', len(sw))
    all_stopwords = STOP_WORDS.union(sw).union(sw_sk)
    print('all', len(all_stopwords))
    pd.Series(sorted(list(all_stopwords))).to_csv('baseline.csv', index=False)
import os
import re

import cloudpickle
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

nlp = English()

mystops = STOP_WORDS.union({
    'strain', 'strains', 'effect', 'effects', 'flavor', 'flavors', 'bud',
    'buds', ' ', ' ', '$', 'user', 'users', 'produce', 'produces', 'showing',
    'start', 'started', 'price', 'refers', 'packs', 'tends', 'stem', 'stems',
    'report', 'supposedly', 'breed', 'bred', 'seed', 'seeds', 'intermittent',
    'week', 'combine', 'combines', 'containing', '\xa0', 'smell', 'give',
    'gives', 'explanation', 'call', 'calls', 'match', 'matches', 'making',
    'tend', 'lineage', 'probably', 'especially', 'utilizing', 'offer',
    'offers', 'technique', 'techniques', 'like', 'including'
})


def token_str(s):
    if type(s) != str:
        return list()
    s = s.lower()
    s = re.sub(r'[\.,!?\\\-\$_]', ' ', s)
    s = re.sub(r' +', ' ', s)
    s = s.strip()
    if s == 'None' or s == '':
        return list()
    return [
def set_stopwords(self, stopwords):
    """Set stop words"""
    for word in STOP_WORDS.union(set(stopwords)):
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
def __init__(self,
             num_distinct_documents=5000,
             replace_entities=True,
             max_term_length=127,
             remove_stopwords=True,
             custom_stopwords=[
                 ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                 'I', '(', ')'
             ],
             analyze=False,
             document_tabe_name="documents",
             sentence_table_name="sentences",
             sentence_fields=OrderedDict({
                 "doc_id": "document_id",
                 "sen_id": "sentence_id",
                 "content": "sentence_text"
             }),
             term_table_name="terms",
             term_sql_format=("term_id", "term_text", "is_entity"),
             term_occurrence_table_name="term_occurrence",
             term_occurrence_sql_format=("document_id", "sentence_id",
                                         "term_id"),
             entity_table_name="entities",
             entity_sql_format=("entity_id", "entity_type"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/TermGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes various parameters, registers logger and MongoConnector, and sets up the limit.

    :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
           For performance reasons, this should be limited during debugging/development.
           0 (zero) represents no limit, in accordance with the MongoDB standard for .limit().
    :param replace_entities: (boolean) Whether or not the entities in the text should be
           recognised and replaced. The reason for this is that single terms might be merged
           into one term, i.e. first and last name: "Dennis" "Aumiller" would be two separate
           terms in the traditional splitting (replace_entities=False), whereas - if set to
           True - "Dennis Aumiller" would represent only one entity.
    :param max_term_length: (int) Indicator of how long the terms are supposed to be
           (varchar property in the table).
    :param remove_stopwords: (boolean) Determines whether or not stop words are removed.
           Currently, we are still deciding on the final set, but likely either one (or both)
           of NLTK's and spaCy's stop word lists.
    :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
    :param analyze: (boolean) Whether or not to include analytically relevant metrics.
    :param document_tabe_name: (str) Name of the table where the document information is stored.
    :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
    :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to
           Postgres for the sentence table and its fields.
    :param term_table_name: (str) Name of the Postgres table for the terms.
    :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the
           Postgres columns suffices.
    :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences.
    :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
    :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
    :param entity_sql_format: (str) Same as term_sql_format, but for entities.
    :param database: (str) Database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the Postgres database.
    :param port: (integer) Port at which to access the database.
""" # set up logger self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info("Successfully registered logger to TermGenerator.") # register a MongoConnector self.mc = MongoConnector() self.logger.info( "Successfully registered MongoConnector to TermGenerator.") # PostgresConnector self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to DocumentGenerator.") self.num_distinct_documents = num_distinct_documents # do this earlier since we need it already for the distinct documents. self.document_table_name = document_tabe_name # get the distinct IDs for the documents so we can match against them later # since we have removed parts of the document collection, we have to make sure to get this from Postgres. self.logger.info("Parsing relevant documents from Postgres...") with self.pc as open_pc: open_pc.cursor.execute("SELECT document_id FROM {}".format( self.document_table_name)) self.first_distinct_documents = list(open_pc.cursor.fetchall()) # extract from the tuple structure self.first_distinct_documents = [ el[0] for el in self.first_distinct_documents ] self.logger.info("Retrieved all relevant documents from Postgres.") # additionally restrict if we want only a number of documents. if self.num_distinct_documents != 0: self.logger.info( "Non-zero limit detected. Limiting to the first N entries.") self.first_distinct_documents = self.first_distinct_documents[:self . num_distinct_documents] self.replace_entities = replace_entities self.analyze = analyze self.max_term_length = max_term_length self.nlp = spacy.load("en") # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether # there are any entities in the current sentence with higher efficiency. self.occurrence_dict = {} self.occurring_entities = [] # start building the term dictionary/set, as well as an occurence map. Since terms will be "post-processed", # it is first created as a list and later cast to Counter and set. self.terms = [] # cast into a set later on. 
    self.term_in_sentence = set()
    self.term_id = {}
    self.term_is_entity = {}
    if self.analyze:
        self.term_count = Counter()
        self.entity_count = Counter()
    self.entities = []
    self.sentences = []
    self.processed_sentences = []

    # Postgres tables
    if not sentence_fields:
        self.logger.error("No sentence fields specified!")
    self.sentence_table_name = sentence_table_name
    self.sentence_fields = sentence_fields

    if not term_sql_format:
        self.logger.error("No term fields specified!")
    self.term_table_name = term_table_name
    self.term_sql_format = ", ".join(term_sql_format)

    if not term_occurrence_sql_format:
        self.logger.error("No term occurrence fields specified!")
    self.term_occurrence_table_name = term_occurrence_table_name
    self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)

    if not entity_sql_format:
        self.logger.error("No entity fields specified!")
    self.entity_table_name = entity_table_name
    self.entity_sql_format = ", ".join(entity_sql_format)

    # value retrieving parse:
    self.sentence_values_to_retrieve = {
        key: 1
        for key in self.sentence_fields.keys()
    }
    # suppress _id if not present:
    if "_id" not in self.sentence_values_to_retrieve.keys():
        self.sentence_values_to_retrieve["_id"] = 0
    self.sentence_sql_format = ", ".join(
        [value for value in self.sentence_fields.values()])

    # create union of stop words, and add potentially custom stop words
    self.remove_stopwords = remove_stopwords
    self.removed_counter = 0
    self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
    # add custom stopwords.
    for word in custom_stopwords:
        self.stopwords.add(word)

    self.logger.info("Successfully initialized TermGenerator.")
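# Hypothetical instantiation of the surrounding TermGenerator class (the class
# name is taken from the log messages above; credentials and connection values
# are placeholders to be replaced with a real Postgres/MongoDB setup):
generator = TermGenerator(num_distinct_documents=1000,
                          replace_entities=True,
                          remove_stopwords=True,
                          database="postgres",
                          user="postgres",
                          password="postgres",
                          host="127.0.0.1",
                          port=5435)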
import json
import os
import re
import string
from pathlib import Path

import spacy
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_lg')

with open('stopwords.txt', 'r', encoding='utf-8') as f:
    STOPWORDS = f.readlines()
STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings:
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',
}

documents = []  # [ [token, token, token], [token, token, token], ...]
fp = '../data/LRECjson/'
for jsonfile in os.listdir(Path(fp)):
    # for jsonfile in ['../data/LRECjson/2018_1049.json']:
import numpy as np
# from sklearn.externals.six import StringIO
# from sklearn.tree import export_graphviz
# import pydotplus
# from IPython.display import Image
from spellchecker import SpellChecker
import pickle

# using SMOTE to deal with class imbalance
# from imblearn.over_sampling import SMOTE

# these symbols seem to get a higher weight in the final words when Naive Bayes
# is used, so add them to the punctuation set to filter them out
punctuations = string.punctuation + "".join(
    ["...", "..........", "....", "--", "/"])

nlp = spacy.load("en_core_web_sm")
STOP_WORDS = STOP_WORDS.union(CUSTOM_STOP_WORDS)
# excluding NO from stopwords for our use
# STOP_WORDS.discard("no")
# STOP_WORDS.discard("not")
# STOP_WORDS.discard("off")

Urban_vocab = pd.read_csv("urbandict-word-def.csv")
Urban_vocab = Urban_vocab["WORD"].tolist()

# contraction_log = open("1_contractions.log", "w")
# slang_log = open("1_slang.log", "w")
out_vocab = open("1_o_vocab.log", "w")
corpus_vocab = open("1_corpus_vocab.log", "w")

parser = English()
p.set_options(p.OPT.EMOJI, p.OPT.URL, p.OPT.SMILEY, p.OPT.NUMBER,
def get_stopwords():
    custom_stopwords = [
        'tarun', 'tathak', '*****@*****.**', '\\r', '\\n'
    ]
    stopwords = STOP_WORDS.union(set(punctuation)).union(set(custom_stopwords))
    return [x.lower() for x in list(stopwords)]
textrank = tr.TextRank()
normalizer = lnormalizer.Lineal_Normalizer()

pos_list = ['NOUN', 'PROPN', 'ADJ', 'VERB', 'INTJ']
stopword_list = [
    "ANTECEDENTES", "ANTECEDENTE", "Antecedentes", "Antecedente", "OBJETIVOS",
    "OBJETIVO", "Objetivo", "Objetivos", "RESULTADOS", "RESULTADO",
    "Resultado", "Resultados", "MÉTODOS", "METODO", "Método", "Métodos",
    "CONCLUSIONES", "CONCLUSION", "Conclusiones", "Conclusion", "EVALUACIÓN",
    "evaluación", "ANTECEDENTES/OBJETIVO", "INTRODUCCIÓN", "Introduccion",
    "RESUMEN", "Resumen", "estudio", "año"
]

# Set stop words
for word in STOP_WORDS.union(set(stopword_list)):
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True


def get_tokens(text, entity_map={}, lower=False):
    doc = nlp(text)
    sentences = []
    current_offset = 0
    # general entities
    for entity in doc.ents:
        if ":" in entity.text:
            continue
        entity_map[entity.start_char] = entity.text.replace(" ", "_")
def make_alphabetic(text):
    """
    A helper function to remove numbers and punctuation
    before passing the data to my preprocessing pipeline
    """
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text.lower()


nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])

custom_stopwords = [
    'bra', ' bra', 'bra ', 'bras', 'sport', 'sports', 'a', 'aa', 'ab', 'b',
    'c', 'cb', 'bc', 'd', 'dc', 'cd', 'dd', 'ddd', 'dddd', 'e', 'ee', 'f',
    'ff', 'g', 'gg', 'h', 'hh', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y', 'x', 's', 'm', 'xs', 'l', 'xl', 'xxl', 'lbs',
    'lb', '', ' ', ' ', '\n', '-PRON-', '\ufeff1'
]
stopwords_list = STOP_WORDS.union(custom_stopwords)


def lemmatize_pipe(doc):
    lemma_list = [
        str(tok.lemma_) for tok in doc
        if tok.is_alpha and tok.text and tok.lemma_ not in stopwords_list
    ]
    lem_string = " ".join(lemma_list)
    return lem_string.lower()


def preprocess_pipe(texts, batch_size=100):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=-1):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe
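# Hypothetical end-to-end use of the helpers above on two invented reviews
# (assumes the spaCy 2.x API that the snippet itself targets, since nlp.pipe
# is called with the old n_threads keyword):
reviews = [
    "This sports bra fits perfectly for running!",
    "Way too tight around the band, sadly returning it.",
]
cleaned = [make_alphabetic(r) for r in reviews]
print(preprocess_pipe(cleaned, batch_size=2))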
def my_stopwords(self, stopwords):
    for word in STOP_WORDS.union(set(stopwords)):
        lexeme = parser.vocab[word]
        lexeme.is_stop = True
def __init__(self):
    self.additional_stop_words = {"-PRON-"}
    self.stop_words = set(STOP_WORDS.union(self.additional_stop_words))
from collections import OrderedDict

import numpy as np
import spacy  # NLP library that analyses text to extract keywords
from spacy.lang.en.stop_words import STOP_WORDS

from keyword_text_analyser.text_analyse_utils import (sentence_segment,
                                                      get_token_pairs,
                                                      get_vocab, get_matrix)

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

custom_stopwords = ['use']
stopwords = STOP_WORDS.union(set(custom_stopwords))
window_size = 4
candidate_pos = ['NOUN', 'PROPN']


def set_stopwords():
    """Set stop words"""
    for word in stopwords:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True


class TextRank4Keyword:
    """Extract keywords from text"""

    def __init__(self):
        # Set stop words
        set_stopwords()
        self.d = 0.85  # damping coefficient, usually 0.85
        self.min_diff = 1e-5  # convergence threshold
def load_stop_words(stopwords=""): stop_words = [] for word in STOP_WORDS.union(set(stopwords.split(" "))): stop_words.append(word) return stop_words
def plot_wordcloud(df,
                   name="unigram",
                   labels=None,
                   specific_label=None,
                   sentiments=None,
                   additional_stopwords=None,
                   types=None,
                   figsize=(15, 15),
                   max_words=20,
                   weights=None,
                   make_title=True):
    """
    This function creates wordclouds:
    - when labels and a specific label are passed, only a wordcloud for this one cluster is created
    - when only labels are passed, a wordcloud for each cluster is created
    - when neither labels nor specific_label is passed, a wordcloud for the complete df is created
    Note that the labels and the sentiments need not be of the length of the passed df,
    but can simply correspond to the original reviews.

    :param df: the unigram dataframe
    :param name: the column name where the unigrams are stored
    :param labels: the cluster labels
    :param specific_label: a specific label (if only a wordcloud for this cluster should be created)
    :param sentiments: sentiments belonging to the reviews
    :param additional_stopwords: a set (or list) containing additional stopwords that are not
           already in the spaCy stopwords
    :param types: a list with word types that are allowed in the wordcloud,
           see https://spacy.io/api/annotation; can be e.g. "NOUN", "VERB" or "ADJ"
    :param figsize: the figsize
    :param max_words: the maximal number of words that are displayed in one wordcloud
    :param weights: whether the term frequencies used for the wordcloud are weighted
    :param make_title: whether to include a title
    :return: figure
    """
    if additional_stopwords is None:
        additional_stopwords = set()
    stopwords = STOP_WORDS.union(additional_stopwords)

    # wordcloud for whole df----------------------------------------------------
    if labels is None and specific_label is None:
        term_frequency = get_term_frequency(
            df=df,
            name=name,
            types=types,
            additional_stopwords=additional_stopwords,
            weights=weights)
        wordcloud = WordCloud(background_color="white",
                              stopwords=stopwords,
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)
        fig, axs = plt.subplots(figsize=figsize)
        title = f"n = {max(df['sentence_id'])}"
        if sentiments is not None:
            average_sentiment = np.round(np.mean(sentiments), 2)
            title = title + f", average sentiment = {average_sentiment}"
        if make_title:
            axs.set_title(title)
        axs.imshow(wordcloud)
        # remove x and y ticks
        axs.set_xticks([])
        axs.set_yticks([])
        return fig

    # wordcloud for a specific label--------------------------------------------
    if specific_label is not None:
        recycled_labels = recycle(labels, df["sentence_id"])
        term_frequency = get_term_frequency(
            df=df[recycled_labels == specific_label],
            name=name,
            types=types,
            additional_stopwords=additional_stopwords,
            weights=weights)
        wordcloud = WordCloud(background_color="white",
                              stopwords=stopwords,
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)
        fig, axs = plt.subplots(figsize=figsize)
        cluster_size = sum(labels == specific_label)
        title = f"Wordcloud for cluster {specific_label}, n = {cluster_size}"
        if sentiments is not None:
            average_sentiment = np.round(
                np.mean(sentiments[labels == specific_label]), 2)
            title = title + f", average sentiment = {average_sentiment}"
        if make_title:
            axs.set_title(title)
        axs.imshow(wordcloud)
        # remove x and y ticks
        axs.set_xticks([])
        axs.set_yticks([])
        return fig

    # wordclouds for all clusters-----------------------------------------------
    # term frequency is a list with counter objects, one for each cluster
    term_frequency_list = get_term_frequency(
        df=df,
        name=name,
        types=types,
        labels=labels,
        additional_stopwords=additional_stopwords,
        weights=weights)

    def do_wc(term_frequency):
        wordcloud = WordCloud(background_color="white",
                              max_words=max_words).generate_from_frequencies(
                                  frequencies=term_frequency)
        return wordcloud

    # create wordcloud for each cluster
    wordclouds = [
        do_wc(term_frequency_list[i]) for i in range(len(term_frequency_list))
    ]
    label_list = list(set(labels))
    n_clusters = len(label_list)
    ncol = 2
    nrow = int(np.ceil(n_clusters / ncol))
    fig, axs = plt.subplots(nrow, ncol, figsize=figsize)
    plt.subplots_adjust(hspace=0.3)
    for i in range(nrow):
        for j in range(ncol):
            if i * ncol + j + 1 <= n_clusters:
                # if n_clusters is uneven one subplot is empty
                index = i * ncol + j
                label = label_list[index]
                title = f"label = {label}, n = {sum(labels == label)}"
                if sentiments is not None:
                    average_sentiment = round(
                        np.mean(sentiments[labels == label]), 2)
                    title = title + \
                        f", average_sentiment = {average_sentiment}"
                if make_title:
                    axs[i, j].set_title(title)
                axs[i, j].imshow(wordclouds[index])
                # remove x and y ticks
                axs[i, j].set_xticks([])
                axs[i, j].set_yticks([])
            else:
                # this is potentially the empty plot if the number of plots is uneven
                fig.delaxes(axs[i, j])
    plt.tight_layout()
    plt.close()
    return fig
def get_term_frequency(df,
                       name=None,
                       types=None,
                       labels=None,
                       weights=None,
                       additional_stopwords=None,
                       sorted_df=False):
    """
    This function creates a term frequency table for a dataframe that is created by get_ngram_df.
    Either one table is created for the whole df (when labels is None), or one term-frequency
    dict is created for each cluster, i.e. the return is a dict where output[k] contains the
    term frequencies for label k. If weights are passed, each object is weighted according to
    its weight instead of counting as 1.

    :param df: a ngram-dataframe as obtained by the function get_ngram_df
    :param name: the name of the column in df that contains the ngrams, e.g. "unigram",
           "bigram" or "unigram_stem"; if kept at None it is assigned to "unigram", "bigram"
           or "ngram" if the name exists in df.columns
    :param types: df has a column "pos" that can be used to e.g. subset words. Only the rows
           for which df["pos"] is in types are considered when creating the term frequencies.
           If None, all types are accepted
    :param labels: cluster labels as numpy array or list
    :param weights: a numpy array or list that contains the weights; if None, simple counting is done
    :param additional_stopwords: basic stopwords from spaCy are removed by default. In some
           cases it is useful to remove further task-specific stopwords that can be passed as
           a set or list; None means no additional stopwords
    :param sorted_df: if True the term frequencies are a sorted dataframe, otherwise they are
           a Counter object
    :return: the term frequencies
    """
    # input checking------------------------------------------------------------
    assert isinstance(df, pd.core.frame.DataFrame)
    assert name is None or isinstance(name, str)
    if name is not None:
        assert name in df.columns
    else:
        if "unigram" in df.columns:
            name = "unigram"
        elif "bigram" in df.columns:
            name = "bigram"
        elif "ngram" in df.columns:
            name = "ngram"
        else:
            raise Exception("name is None and could not be identified")
    assert types is None or isinstance(types, (list, str, set))
    if isinstance(types, str):
        types = [types]
    if isinstance(types, list):
        assert "pos" in df.columns
    assert labels is None or isinstance(labels, (list, np.ndarray))
    # we have to ensure that weights and labels are numpy arrays because
    # if they remain a list we cannot subset via e.g. [2, 5, 7]
    if isinstance(labels, list):
        labels = np.array(labels)
    assert weights is None or isinstance(weights, (list, np.ndarray))
    if isinstance(weights, list):
        weights = np.array(weights)
    if additional_stopwords is None:
        additional_stopwords = set()
    assert isinstance(additional_stopwords, (set, list, str))
    if isinstance(additional_stopwords, str):
        additional_stopwords = [additional_stopwords]
    assert isinstance(sorted_df, bool)
    assert "sentence_id" in df.columns

    # recycle weights and labels if required------------------------------------
    if labels is not None and len(labels) != len(df):
        labels = recycle(labels, df["sentence_id"])
    if weights is not None and len(weights) != len(df):
        weights = recycle(weights, df["sentence_id"])

    # subset relevant ngrams----------------------------------------------------
    if name == "unigram":
        # in case of unigrams we include the standard stopwords from spacy
        stopwords = STOP_WORDS.union(additional_stopwords)
    else:
        stopwords = additional_stopwords
    relevant = df[name].apply(lambda x: x not in stopwords)
    if types is not None:
        relevant_type = df["pos"].apply(lambda x: x in types)
        relevant = relevant & relevant_type
    relevant_df = df[[name, "sentence_id"]][relevant]
    if labels is not None:
        labels = labels[relevant]
    if weights is not None:
        weights = weights[relevant]

    # calculate the term frequencies--------------------------------------------
    if labels is None:
        return get_tf(ngrams=relevant_df[name],
                      name=name,
                      weights=weights,
                      sorted_df=sorted_df)
    else:
        output = dict()
        current_weights = None
        for label in set(labels):
            # ATTENTION: note that it is important here that we subset the
            # dataframe with a logical vector and not the indices, because then
            # one would have to pay attention to resetting the indices of the
            # pandas dataframe when constructing the relevant_df
            if weights is not None:
                current_weights = weights[labels == label]
            output[label] = get_tf(ngrams=relevant_df[name][labels == label],
                                   name=name,
                                   weights=current_weights,
                                   sorted_df=sorted_df)
        return output
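# Hypothetical call on the same unigram dataframe: one frequency table per
# cluster, restricted to nouns (returns a dict keyed by cluster label with a
# Counter per cluster when sorted_df=False, as the docstring describes;
# `unigram_df` and `cluster_labels` are assumed to exist).
tf_per_cluster = get_term_frequency(unigram_df,
                                    name="unigram",
                                    types="NOUN",
                                    labels=cluster_labels,
                                    additional_stopwords={"review"})
for label, counter in tf_per_cluster.items():
    print(label, counter.most_common(5))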