def create_tfidf(oversample=False, description=False):
    print("Reading the data...")
    if oversample:
        df_train = get_oversampled_train()
    else:
        df_train = pd.read_csv("data/train_raw.csv")
    df_test = pd.read_csv("data/test_raw.csv")

    print("Creating the corpus...")
    corpus_train = textacy.Corpus(lang='en', texts=df_train['description'].tolist())
    corpus_test = textacy.Corpus(lang='en', texts=df_test['description'].tolist())
    tokenized_docs_train = (
        doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in corpus_train)
    tokenized_docs_test = (
        doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in corpus_test)

    print("Generating TF-IDF...")
    vectorizer = textacy.Vectorizer(apply_idf=True, norm="l2", min_df=4, max_df=.95)
    tfidf_train = vectorizer.fit_transform(tokenized_docs_train)
    tfidf_test = vectorizer.transform(tokenized_docs_test)
    tfidf_train = pd.DataFrame(tfidf_train.toarray())
    tfidf_test = pd.DataFrame(tfidf_test.toarray())

    if description:
        pd.concat([tfidf_train, df_train['label']], axis=1).to_csv("data/tfidf_train_description.csv", index=False)
        pd.concat([tfidf_test, df_test['label']], axis=1).to_csv("data/tfidf_test_description.csv", index=False)
    else:
        pd.concat([tfidf_train, df_train['label']], axis=1).to_csv("data/tfidf_train.csv", index=False)
        pd.concat([tfidf_test, df_test['label']], axis=1).to_csv("data/tfidf_test.csv", index=False)
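A hedged usage sketch for the function above; it assumes the data/*.csv files and the get_oversampled_train helper from the surrounding project exist.

# Illustrative call only; input CSVs and helpers come from the original project.
# With description=False this writes data/tfidf_train.csv and data/tfidf_test.csv.
create_tfidf(oversample=True, description=False)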
def create_textacy_corpus(corpus_reader, nlp, tick=utility.noop, strip_tensor=True):
    logger.info('creating corpus (this might take some time)...')
    batch_size = 100
    corpus = textacy.Corpus(nlp)
    document_id = 0
    n_chunk_threshold = 50000
    for filename, text, metadata in corpus_reader:
        metadata = utility.extend(metadata, dict(filename=filename, document_id=document_id))
        if len(text) > n_chunk_threshold:
            spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(spacy_doc, metadata)
        else:
            corpus.add_text(text, metadata)
        if strip_tensor:
            for doc in corpus:
                doc.spacy_doc.tensor = None
        document_id += 1
        if document_id % batch_size == 0:
            logger.info('%s documents added...', document_id)
            tick(document_id)
    return corpus
def create_corpus(lang="en_core_web_lg"):
    # nlp = en
    # component = entities.FinancialEntityRecognizer(nlp, entitites._financial_institutions)  # initialise component
    # en.add_pipe(component, before="ner")
    bpd = BlockchainPapersDataset()
    corpus = textacy.Corpus(lang, data=bpd.records())
    return corpus
def create_textacy_corpus(corpus_reader, nlp, tick=utility.noop, n_chunk_threshold=100000):
    corpus = textacy.Corpus(nlp)
    counter = 0
    for filename, document_id, text, metadata in corpus_reader:
        metadata = utility.extend(metadata, dict(filename=filename, document_id=document_id))
        if len(text) > n_chunk_threshold:
            doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(doc)
            doc._.meta = metadata
        else:
            corpus.add((text, metadata))
        counter += 1
        if counter % 100 == 0:
            logger.info('%s documents added...', counter)
            tick(counter)
    logger.info('Done! %s documents added!', counter)
    return corpus
def exec_pipeline(self, texts, pipeline_components, normalize_texts=True):
    """
    Starts the spaCy NLP pipeline (https://miro.medium.com/max/700/1*tRJU9bFckl0uG5_wTR8Tsw.png)
    defined in the constructor for a corpus.

    Parameters
    ----------
    texts : List[Dict]
        Expects a list of document dicts with their metadata
    pipeline_components : List[str]
        List of analysis types to consider for this pipeline
    normalize_texts : bool, optional
        Whether to clean the texts before processing, by default True
        (should only be False for debugging purposes)
    """
    self.corpus = textacy.Corpus(self.nlp)
    with self.nlp.disable_pipes(*self._remove_unused_components(pipeline_components)):
        if self.threads == -1:
            if platform.startswith('win32'):
                partitions = minibatch(texts, math.ceil(len(texts) / cpu_count()))
            else:
                from os import sched_getaffinity
                partitions = minibatch(texts, math.ceil(len(texts) / len(sched_getaffinity(0))))
        else:
            partitions = minibatch(texts, math.ceil(len(texts) / self.threads))
        executor = Parallel(n_jobs=self.threads, require="sharedmem", prefer="threads", verbose=10)
        do = delayed(partial(self._exec_pipeline_for_sub_corpus, normalize_texts))
        tasks = (do(i, batch) for i, batch in enumerate(partitions))
        sub_corpora = executor(tasks)
        self.corpus.add_docs([doc for corpus in sub_corpora for doc in corpus])
def create_textacy_corpus(documents, nlp, tick=utility.noop):
    corpus = textacy.Corpus(nlp)
    for filename, text, metadata in documents:
        corpus.add_text(text, utility.extend(dict(filename=filename), metadata))
        tick()
    return corpus
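A minimal, hypothetical call for the loader above; the (filename, text, metadata) tuples and the spaCy pipeline below are made up for illustration, while utility.noop comes from the surrounding project.

import spacy

# Hypothetical input: an iterable of (filename, text, metadata) tuples.
docs = [("a.txt", "Some text about finance.", {"year": 2020}),
        ("b.txt", "Another short document.", {"year": 2021})]
nlp = spacy.load("en_core_web_sm")
corpus = create_textacy_corpus(docs, nlp)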
def txt_to_corpus(txt_dir, lang=en, txt_extention=".txt"):
    """
    Reads a text directory, puts the files into a textacy corpus with the
    filename as metadata, and adds a docstats_df.
    """
    # Get the name of the function - should be a decorator for every function
    functionNameAsString = sys._getframe().f_code.co_name
    logging.debug(f"Function: {functionNameAsString} -- Loading Text from: {txt_dir}")

    # Get a list of files to get text for
    flpth_gen = textacy.io.utils.get_filepaths(txt_dir,
                                               match_regex=None,
                                               ignore_regex=None,
                                               extension=".txt",
                                               ignore_invisible=True,
                                               recursive=True)

    # Loop through the text directory (input), for all the files ending with .txt
    # docs_lst = [dask.delayed(txtfile_to_doc)(flpth, en) for flpth in flpth_gen]
    rec_lst = [txt_to_docrec(flpth) for flpth in flpth_gen]
    # docs = dask.compute(docs_lst)

    # Add docs to a textacy corpus
    crps = textacy.Corpus(en, rec_lst)

    # Calculate stats for each doc in the corpus and make a docstats_df
    crpsStats = ptg.corpus_stats.CorpusStats(crps)
    crps.docstats_df = crpsStats.docstats_df

    logging.debug(f"Function: {functionNameAsString} -- Loaded {crps.n_docs}")
    return crps
def get_topics(speeches, path_to_save, n_topics=10, n_words=10):
    '''
    INPUT: List of cleaned speeches
    OUTPUT: Top n_words for n_topics
    '''
    corpus = textacy.Corpus('en', texts=speeches)
    doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
        (doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True) for doc in corpus),
        weighting='tfidf', normalize=True, smooth_idf=True, min_df=2, max_df=0.95)
    model = textacy.tm.TopicModel('nmf', n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    topic_dic = {}
    for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=n_words):
        topic_dic['Topic ' + str(topic_idx)] = top_terms
    model.termite_plot(doc_term_matrix, id2term, topics=-1, n_terms=25,
                       highlight_topics=[2, 3, 4, 5, 8, 9],
                       sort_terms_by='seriation', save=path_to_save)
    return topic_dic
def create_corpus(self, fpath):
    """
    Load the csv file /code/data_all.csv from disk. Each row is one document.
    The first column is expected to be the text/body we want to analyse with
    textacy; the remaining columns are stored as metadata associated with each
    document.

    Returns a textacy.Corpus.
    """
    # read all eea documents from csv file
    eeadocs = textacy.fileio.read.read_csv(fpath)  # '/code/data_all.csv'

    # use the title as the "text" to analyse,
    # therefore split the title (first column, index 0) from the metadata
    content_stream, metadata_stream = textacy.fileio.split_record_fields(eeadocs, 0)

    # create textacy english Corpus
    corpus = textacy.Corpus('en', texts=content_stream, metadatas=metadata_stream)

    return corpus
def load_corpus(filename, lang, document_id='document_id', format='binary'):
    if format == 'binary':
        '''HACK: read docs saved in 'binary' format. NOTICE: textacy patch'''
        docs = textacy_patch.read_spacy_docs(filename, format=format, lang=lang)
        corpus = textacy.Corpus(docs=docs, lang=lang)
        # spacy_docs = textacy.io.read_spacy_docs(filename, format=format, lang=lang)
        # first_spacy_doc, spacy_docs = itertoolz.peek(spacy_docs)
        # spacy_lang_meta = first_spacy_doc.user_data['textacy'].pop('spacy_lang_meta')
        # spacy_lang = spacy.util.get_lang_class(spacy_lang_meta['lang'])(vocab=first_spacy_doc.vocab, meta=spacy_lang_meta)
        # for name in spacy_lang_meta['pipeline']:
        #     spacy_lang.add_pipe(spacy_lang.create_pipe(name))
        # return cls(spacy_lang, docs=spacy_docs)
    else:
        corpus = textacy.Corpus.load(filename)
        # for doc in corpus:
        #     user_data = doc.spacy_doc.user_data
        #     user_data['year'] = int(user_data['year']) if 'year' in user_data else 0
        #     doc.metadata.update(user_data)
        #     # metadata = doc.spacy_doc.user_data['textacy']['metadata']
        #     # for x in ['filename', document_id]:
        #     #     if x in metadata.keys():
        #     #         corpus[0].metadata[x] = metadata[x]
    return corpus
def make_corpus(df: pd.DataFrame, col_name: str, min_token_count: int) -> textacy.Corpus:
    spacy_records = df[col_name].apply(
        lambda x: textacy.make_spacy_doc(x, lang="en"))
    long_records = [
        record for record in spacy_records if len(record) >= min_token_count
    ]
    corpus = textacy.Corpus("en", data=list(long_records))
    return corpus
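An illustrative call for make_corpus with a toy DataFrame; the column name "text" and the threshold are made up, and the "en" shortcut assumes an installed English model in the textacy version this snippet targets.

# Toy example; real data would come from the caller.
df = pd.DataFrame({"text": ["A short note.",
                            "A somewhat longer document about building corpora."]})
corpus = make_corpus(df, col_name="text", min_token_count=4)
print(corpus.n_docs)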
def setUp(self):
    self.spacy_lang = textacy.data.load_spacy('en')
    self.cw = textacy.datasets.CapitolWords()
    self.text = list(self.cw.texts(speaker_name={'Bernie Sanders'}, limit=1))[0]
    self.doc = textacy.Doc(self.text.strip(), lang=self.spacy_lang)
    records = self.cw.records(speaker_name={'Bernie Sanders'}, limit=10)
    text_stream, metadata_stream = textacy.fileio.split_record_fields(records, 'text')
    self.corpus = textacy.Corpus(
        self.spacy_lang, texts=text_stream, metadatas=metadata_stream)
def df_to_corpus(df):
    # Load into textacy to delimit sentences
    img_labels = df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')
    # Load english model
    en = en_core_web_sm.load()
    corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    return corpus
def get_stats(speeches):
    '''
    INPUT: List of uncleaned speeches
    OUTPUT: List of readability statistics, one per speech
    '''
    corpus = textacy.Corpus('en', texts=speeches)
    speech_stats = []
    for text in corpus:
        speech_stats.append(textacy.text_stats.readability_stats(text))
    return speech_stats
def _exec_pipeline_for_sub_corpus(self, normalize_texts, batch_id, docs):
    # Internal function to enable multi-threaded pipeline execution
    sub_corpus = textacy.Corpus(self.nlp)
    for doc in docs:
        if doc['text']:
            if normalize_texts:
                spacy_doc = textacy.make_spacy_doc(
                    (normalize(self.language, doc['text']), {'celex': doc['celex']}),
                    self.nlp)
            else:
                spacy_doc = textacy.make_spacy_doc(
                    (doc['text'], {'celex': doc['celex']}), self.nlp)
            sub_corpus.add_doc(spacy_doc)
    return sub_corpus
def df_to_corpus(df, txt_column='RESOURCE'):
    # Load into textacy to delimit sentences
    img_labels = df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, txt_column)
    # Use the english model loaded at module level (`en`)
    corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    return corpus
def createqcorpus(q):
    data = [tuple(x) for x in q.values]
    qcorpus = textacy.Corpus("en")  # initialise corpus for question
    for r, *items in data:
        response = " ".join(r.split())
        if len(items) == 1:
            qcorpus.add_text(response, metadata={"ID": str(items[0]), "categories": [], "notes": ""})
        elif len(items) == 2:
            qcorpus.add_text(response, metadata={"ID": str(items[0]), "PostID": str(items[1]), "categories": [], "notes": ""})
        elif len(items) == 3:
            qcorpus.add_text(response, metadata={"ID": str(items[0]), "PostID": str(items[1]), "ParentID": str(items[2]), "categories": [], "notes": ""})
    return qcorpus
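A hedged example input for createqcorpus; the column layout is assumed here (response text first, followed by one to three ID columns).

# Hypothetical DataFrame: first column is the response text, second is an ID.
q = pd.DataFrame({"response": ["I agree entirely.", "Not sure about this."],
                  "ID": [101, 102]})
qcorpus = createqcorpus(q)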
def __init__(self):
    self._min_occurrence_for_topic = 2
    self._common_verbs = 10
    # create an empty corpus
    self._en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',))
    self._corpus = textacy.Corpus(lang=self._en)
    self._content = None
    self._model = None
    self._numdocs = 0
    self._numtopics = 0
    self._terms = None
    self._doc_term_matrix = None
    self._doc_topic_matrix = None
    self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                  norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
def createtcorpus(filename, refId):
    # just reading one text for now - use tcorpus.add_texts() when multifile upload is implemented
    tcorpus = textacy.Corpus("en")
    text_to_add = textacy.fileio.read.read_file(CORPUS_ROOT + filename)
    tcorpus.add_text(text_to_add)
    origpath = os.path.join(app.config['CORPUS_FOLDER'], str(refId))
    if not os.path.exists(origpath):
        os.makedirs(origpath)
    fname = filename.rsplit('.', 1)[0]
    path = os.path.join(origpath, fname)
    if not os.path.exists(path):
        os.makedirs(path)
    tcorpus.save(path, name=fname, compression="gzip")
    return tcorpus
def process_text(lst, filepath=None, filename=None, compression=None):
    '''
    DESC: Tokenizes and processes a list of strings using textacy.
          If filepath: saves corpus as pickle to filepath.
    --Input--
        lst: list of strings
        filepath: (str) path to directory where textacy corpus will be saved
        filename: (str) name of pickled textacy corpus
        compression: (str) compression of metadata json ('gzip', 'bz2', 'lzma' or None)
    ----------------------------------
    --Output--
        Returns textacy corpus object; if filepath: saves textacy corpus as pickle
    '''
    corpus = textacy.Corpus('en', texts=lst)
    if filepath:
        corpus.save(filepath, filename, compression)
    return corpus
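A short usage sketch for process_text; the save directory and file name below are hypothetical.

# Illustrative call; omit filepath to skip saving.
reviews = ["Great product, would buy again.", "Arrived broken and late."]
corpus = process_text(reviews, filepath="data/", filename="reviews_corpus", compression="gzip")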
def truncate_docs_in_daily_corpora(corpora):
    trunc_corpora = []
    for corpus in corpora:
        trunc_corpus = textacy.Corpus('en')
        for doc in corpus:
            first_sents_spans = list(itertools.islice(doc.sents, 2))
            # print(first_sents_spans)
            first_sents = ""
            for span in first_sents_spans:
                first_sents += str(span) + " "
            # print(first_sents)
            trunc_doc = textacy.Doc(first_sents, doc.metadata, 'en')
            # print(trunc_doc)
            trunc_corpus.add_doc(trunc_doc, doc.metadata)
        trunc_corpora.append(trunc_corpus)
    return trunc_corpora
def process_text(self, filepath=None, filename=None, compression=None):
    '''
    DESC: Tokenizes and processes a pandas DataFrame using textacy.
          If filepath: saves corpus as pickle to filepath.
    --Input--
        filepath: (str) path to directory where textacy corpus will be saved
        filename: (str) name of pickled textacy corpus
        compression: (str) compression of metadata json ('gzip', 'bz2', 'lzma' or None)
    ----------------------------------
    --Output--
        Returns textacy corpus object; if filepath: saves textacy corpus as pickle
    '''
    if len(self.text) == 0:
        self._get_reviews_and_label()
    self.corpus = textacy.Corpus('en')
    self.corpus.add_texts(texts=self.text, batch_size=1000, n_threads=-1)
    if filepath:
        self.corpus.save(filepath, filename, compression)
        print('Saved textacy corpus to filepath.')
    return
def normalise_text(corpus, settings):
    ncontractions = settings['ncontractions']
    lcase = settings['lcase']
    punct = settings['punct']
    nums = settings['nums']
    textlist = [d.text for d in corpus.docs]
    if ncontractions:
        textlist = [textacy.preprocess.unpack_contractions(w) for w in textlist]
        # hack to deal with standalone n't missing from textacy unpack_contractions after tokenisation
        textlist = [re.sub(r"(\b)(nt|n't)", r"not", w) for w in textlist]
    if lcase:
        textlist = [w.lower() for w in textlist]
    if punct:
        textlist = [strip_punctuation(w, settings) for w in textlist]
    if nums:
        textlist = [textacy.preprocess.replace_numbers(w, "") for w in textlist]
    ncorpus = textacy.Corpus("en")
    for t in textlist:
        ncorpus.add_text(t)
    return ncorpus
def pipe01(limit=0):
    # Define textacy doc preprocessing
    textacy_preprocessor = lambda text: textacy.preprocess.preprocess_text(
        text, no_contractions=True, no_numbers=True, no_emails=True,
        no_currency_symbols=True, lowercase=True)

    # Define nlp pipeline
    nlp = spacy.load("en", add_vectors=False)
    nlp.pipeline = [nlp.tagger, nlp.parser]

    handle = article_handles.articleObjectPhysOrg

    # Do all the other things
    with utils.mongo_open('BlogData', 'PhysOrg') as conn:
        article_iter = conn.query(
            conditions={'html': {'$ne': 'None'},
                        'url': {'$regex': '^http://phys.org/news/'}},
            limit=limit)
        content_stream, metadata_stream = utils.proc_art_iterator(
            handle=handle, mdb_iterator=article_iter, tpp=textacy_preprocessor)
        corpus = textacy.Corpus(lang=nlp, texts=content_stream, metadatas=metadata_stream)

    # Return the data
    return corpus
import textacy
import spacy.en
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

cw = textacy.corpora.CapitolWords()
docs = cw.records(date_range=('1996-01-01', '2016-12-31'))
content_stream, metadata_stream = textacy.fileio.split_record_fields(docs, 'text', 'speaker_name')
corpus = textacy.Corpus('en', texts=content_stream, metadatas=metadata_stream)

# Dataframe of the speaker and speeches created from the corpus
df = pd.read_pickle('speeches')

drop_words = [
    'objection', 'american', 'people', 'gentlewoman', 'gentleman', 'minute',
    'distinguished', 'yea', 'desk', 'vote', 'internship', 'unanimous', 'consent',
    'quorum', 'previous', 'session', 'amendment', 'read', 'immediate',
    'consideration', 'senator', 'congress', 'house', 'rollcall', 'floor',
    'desire', 'nay', 'present', 'ask', 'rescind', 'order', 'recognize', 'yield',
    'question', 'authorize', 'meet', 'proceed', 'motion', 'pending', 'set',
    'table', 'lie', 'president', 'speaker', 'appeal', 'ruling'
]
def start_cluster_batch():
    topic_list_query = "SELECT * from sharelock.topic_list"
    topic_rows = session.execute(topic_list_query)
    topic_row_list = list(topic_rows)
    topic_frames = pd.DataFrame(topic_row_list)
    for idx, frame in topic_frames.iterrows():
        topic = frame['topic']
        category = frame['category']
        query = ("SELECT * from sharelock.active_tweets where topic='" + topic +
                 "' order by inserted_at desc limit 30")
        rows = session.execute(query)
        ent_dict = {}
        sorted_json = {}
        row_list = []
        for row in rows:
            xd = json.loads(row.tweet_batch)
            row_list = row_list + xd
        sorted_result = df = pd.DataFrame(data=row_list)
        sorted_result.set_index('tweet_id')
        sorted_result = sorted_result.drop_duplicates(subset='tweet_id', keep='first')
        # Clean results by dropping items with similarity score 0.98 or higher
        sorted_result['tweet_tokens'] = sorted_result['tweet_text'].apply(nlp)
        sorted_result['tweet_clean_text'] = sorted_result['tweet_text'].apply(get_cleaned_text)
        sorted_result['tweet_clean_tokens'] = sorted_result['tweet_clean_text'].apply(nlp)
        sorted_result = remove_duplicate_posts(sorted_result)

        corpus = textacy.Corpus(lang="en_core_web_lg",
                                texts=list(sorted_result['tweet_text']),
                                metadatas=list(sorted_result['tweet_id']))
        terms_list = (doc.to_terms_list(ngrams=(1, 2, 3), named_entities=True,
                                        normalize=u'lemma', lemmatize=True,
                                        lowercase=True, as_strings=True,
                                        filter_stops=True, filter_punct=True,
                                        min_freq=1,
                                        exclude_pos=("PRON", "X", "PUNCT", "SYM"))
                      for doc in corpus)
        vectorizer = textacy.Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')
        textacy.text_utils.clean_terms(terms_list)
        doc_term_matrix = vectorizer.fit_transform(terms_list)
        num_topics = int(len(sorted_result) / 10)
        model = textacy.tm.TopicModel('nmf', n_topics=num_topics)
        model.fit(doc_term_matrix)
        doc_topic_matrix = model.transform(doc_term_matrix)

        topic_cluster = {}
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, topics=-1, top_n=8, weights=True):
            dct = dict(top_terms)
            tt_list = []
            for j in dct.keys():
                tt_list.append({"term": j, "weight": dct[j]})
            topic_cluster["topic-" + str(topic_idx)] = {"terms": tt_list}

        for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, topics=-1,
                                                        top_n=6, weights=True):
            dct = dict(top_docs)
            tweet_in_topic_list = []
            for j in dct.keys():
                query_str = "tweet_id=" + corpus[j].metadata
                curr = sorted_result[sorted_result['tweet_id'] == corpus[j].metadata]
                curr_frame_row = curr.iloc[0]
                is_attached_to_topic = False
                for prev_topic in topic_cluster:
                    if 'tweets' in topic_cluster[prev_topic]:
                        tweet_list = topic_cluster[prev_topic]['tweets']
                        for tweet in tweet_list:
                            if tweet['tweet_id'] == curr.iloc[0]['tweet_id']:
                                is_attached_to_topic = True
                                break
                if not is_attached_to_topic:
                    tweet_in_topic_list.append({
                        "tweet_id": curr.iloc[0]['tweet_id'],
                        "tweet_text": curr.iloc[0]['tweet_text'],
                        "user_score": str(curr.iloc[0]['user_score']),
                        "raw_score": str(curr.iloc[0]['raw_score'])
                    })
            if tweet_in_topic_list:
                topic_cluster["topic-" + str(topic_idx)]['tweets'] = tweet_in_topic_list

        for curr_topic in topic_cluster:
            if 'tweets' in topic_cluster[curr_topic]:
                sent_weights = []
                for tweet in topic_cluster[curr_topic]['tweets']:
                    sent_weights = sent_weights + get_sent_weights(
                        tweet, topic_cluster[curr_topic]['terms'])
                sent_weights = sorted(sent_weights, key=lambda x: x['final_score'], reverse=True)
                top_sents = sent_weights[0:2]
                sorted_top_sents = sorted(sent_weights, key=lambda x: x['ent_score'], reverse=True)
                topic_title = ""
                topic_title_list = []
                for sent in sorted_top_sents:
                    if sent['structure_penalty'] < 50 and sent['word_score'] > 0:
                        topic_title_list.append(sent['text'].strip('\n'))
                topic_cluster[curr_topic]['title'] = topic_title_list

        result_dict = {}
        for k in topic_cluster.keys():
            if 'tweets' in topic_cluster[k]:
                result_dict[k] = topic_cluster[k]

        insert_at = datetime.datetime.now().timestamp()
        insert_values = [topic, category, insert_at, json.dumps(result_dict)]
        sql_query = ("INSERT into sharelock.topic_clusters "
                     "(topic, category, inserted_at, tweet_cluster) values (?, ?, ?, ?)")
        try:
            prepared = session.prepare(sql_query)
            session.execute(prepared, insert_values)
        except Exception as e:
            print(e)
import textacy
import textacy.datasets
# import numpy
# import spacy
#
# text = ('Since the so-called "statistical revolution" in the late 1980s and mid 1990s, '
#         'much Natural Language Processing research has relied heavily on machine learning. '
#         'Formerly, many language-processing tasks typically involved the direct hand coding '
#         'of rules, which is not in general robust to natural language variation. '
#         'The machine-learning paradigm calls instead for using statistical inference '
#         'to automatically learn such rules through the analysis of large corpora '
#         'of typical real-world examples.')
#
# textacy.text_utils.KWIC(text, 'example', window_width=35)
#
# print(textacy.preprocess_text(text, lowercase=True, no_punct=True) + "\n")
#
# spacy.load('en')
# doc = textacy.Doc(text)

cw = textacy.datasets.CapitolWords()
cw.download()
records = cw.records(speaker_name={'Hillary Clinton', 'Barack Obama'})
text_stream, metadata_stream = textacy.fileio.split_record_fields(records, 'text')
corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
print(corpus)
with left_column:
    pos = st.multiselect('Keep', default_pos, default=default_pos)
with right_column:
    freq_cutoff = st.number_input('Minimum Token Freq', min_value=0, value=2)
# color_option = st.radio('Wordcloud colour', ["Default", "Choose one"])
# if (color_option == "Choose one"):
#     colors = st.color_picker('Colour')

en = textacy.load_spacy_lang("en_core_web_sm", disable=())
matcher = get_matcher(en, pos)
corpus = textacy.Corpus(en, data=texts)
terms = get_terms_for_wordcloud(matcher, corpus)

if len(terms) > 0:
    wordcloud_terms = [dict(text=k, value=v) for k, v in terms.items()]
    return_obj = wordcloud.visualize(wordcloud_terms,
                                     tooltip_data_fields={'text': 'Term', 'value': 'Frequency'},
                                     per_word_coloring=False)
    # print(clicked['clicked'])
    if return_obj is not None and 'clicked' in return_obj:
        if return_obj['clicked'] is not None:
            focus_word = return_obj['clicked']['text']
        text = row[text_column].replace("\n", " ")
        # use all columns as metadata, except the column with the actual text
        metadata = row.to_dict()
        del metadata[text_column]
        yield (text, metadata)


# set correct path relative to working directory (folder where you saved this script)
f_csv = "materials/dataset_speeches_federal_council_2019.csv"

# stream the csv-dataset by calling the function defined above
texts = get_texts_from_csv(f_csv, text_column="Text")

# create a corpus with all the texts
corpus_speeches = textacy.Corpus(de, data=texts)

# task 5: two subcorpora
# define two functions filtering by language and period,
# similar to the lambda functions shown in the slides, but perhaps easier to understand
def filter_func_pre(doc):
    return doc._.meta.get("Sprache") == "de" and doc._.meta.get("Jahr") < 2000

# greater-or-equal to include the year 2000
def filter_func_post(doc):
    return doc._.meta.get("Sprache") == "de" and doc._.meta.get("Jahr") >= 2000
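A hedged follow-up sketch showing how the two filter functions could be applied; Corpus.get is textacy's matching method, but the variable names below are illustrative.

# Illustrative: materialise the two subcorpora from the filters defined above.
corpus_pre = textacy.Corpus(de, data=corpus_speeches.get(filter_func_pre))
corpus_post = textacy.Corpus(de, data=corpus_speeches.get(filter_func_post))
print(corpus_pre.n_docs, corpus_post.n_docs)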
def create_corpus(
    input_filepath,
    output_filepath,
    nlp=None,
    specific_stopwords=None,
    return_data=False,
):
    """
    Runs data processing scripts to turn raw data from (../raw) into cleaned
    data ready to be analyzed (saved in ../processed).

    Parameters
    ----------
    input_filepath : str
        Folder path storing immutable raw data. Use a wildcard within the file
        name to filter files via glob.glob.
    output_filepath : str
        File path where the corpus should be saved.
    nlp : spaCy NLP pipeline
    specific_stopwords : iterable, None
        Case-specific stopwords worth deleting before going into the spaCy
        pipeline, to prevent memory allocation problems.
    return_data : bool
        Whether to keep the corpus in memory.

    Returns
    -------
    corpus : textacy.Corpus
        Corpus created from BBC Monitoring data, stored in binary format and
        zipped for optimal compression.
    """
    logger = logging.getLogger(__name__)
    logger.info("Creating corpus from raw BBC Monitoring data")

    # load and configure spacy nlp model
    # https://stackoverflow.com/questions/52557058/spacy-nlp-pipeline-order-of-operations
    # -------------------------------------------------------------------------
    if nlp is None:
        nlp = en_core_web_lg.load()
        nlp.max_length = int(30 * 1e6)
        nlp.remove_pipe("parser")
        nlp.remove_pipe("ner")

    # compile list of documents (slower, but more robust than os.listdir)
    # -------------------------------------------------------------------------
    file_list = glob.glob(input_filepath)
    file_list = sorted(file_list)

    # iteratively read in text stream
    # -------------------------------------------------------------------------
    records = []
    for file_path in tqdm(file_list):
        # extract metadata from the file name
        # ---------------------------------------------------------------------
        fname = file_path.split("/")[-1].split(".")[0].split("_")
        if len(fname) == 2:
            river_basin, year = fname
            month = np.nan
        elif len(fname) == 3:
            river_basin, year, month = fname
        else:
            raise NotImplementedError("Check needed!")
        metadata = {"basin": river_basin, "year": year, "month": month}

        # read and pre-process with nlp pipeline
        # ---------------------------------------------------------------------
        with open(file_path) as f_input:
            # 1) read raw text file
            text_raw = f_input.read()

            # 2) pre-process with utils (textacy only, or textacy & gensim)
            text = preprocess_text(
                text_raw,
                char_count_filter=True,
                stopwords=specific_stopwords,
                min_len=3,
                max_len=15,
            )

            # 3) create doc with metadata
            doc = textacy.make_spacy_doc(data=(text, metadata), lang=nlp)

            # 4) append record
            records.append(doc)

    # build corpus
    # -------------------------------------------------------------------------
    corpus = textacy.Corpus(nlp, data=records)
    corpus.save(output_filepath)

    # optionally keep corpus in memory
    if return_data:
        return corpus
    else:
        return None