def preprocess_text(article, truncate=False, return_string=False):
    '''
    Cleans up articles by removing page marker junk, unicode formatting, and extra whitespaces;
    re-joining words split by (hyphenated at) end of line;
    removing numbers (by default) and acronyms (not by default);
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR);
    lower-casing words;
    removing stopwords (same as JSTOR), junk formatting words, junk sentence fragments,
    and proper nouns (the last not by default).

    Args:
        article (str): raw OCR'd academic article text
        truncate (False or int): if an integer, keep only the first `truncate` words of each article (like an abstract)
        return_string (bool): whether to return a str (instead of a list of str)

    Returns:
        str or list of str: each element of the list is a word
    '''

    if truncate:
        return_string_temp = False  # need the tokenized version to count words in the article
    else:
        return_string_temp = return_string

    # Remove page marker junk
    article = article.replace('<plain_text><page sequence="1">', '')
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)

    article = clean_sentence_apache(article,
                                    unhyphenate=True,
                                    remove_numbers=False,
                                    remove_acronyms=False,
                                    remove_stopwords=False,
                                    remove_propernouns=False,
                                    return_string=return_string_temp)

    if truncate:
        article = article[:truncate]  # keep only the first `truncate` words of the article
    if truncate and return_string:   # join into a string here if not done earlier
        article = ' '.join(article)

    return article
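# A minimal usage sketch (not part of the original script), assuming `preprocess_text` above is in
# scope and that the repo's `clean_sentence_apache` helper is importable; the raw string below is a
# made-up example meant only to show the truncate / return_string combinations.
sample_ocr = '<plain_text><page sequence="1">Charter schools ex-\npanded rapidly in the 1990s.</page>'
first_words = preprocess_text(sample_ocr, truncate=5, return_string=False)  # list of at most 5 words
as_string = preprocess_text(sample_ocr, truncate=5, return_string=True)     # same words, joined into one str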
unicode_list = unicode_make()
punctstr = punctstr_make()

print("Stopwords, Unicodes, Punctuations lists creation complete!")

# word2vec computation
whole_text_unnested = []  # flat list of tokenized sentences across all documents
whole_text_nested = []    # list of documents, each a list of tokenized sentences

tqdm.pandas(desc="Cleaning text")

for school in tqdm(df['text'], desc="Cleaning text"):
    doc = []
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence_apache(sent,
                                         unhyphenate=True,
                                         remove_propernouns=False,
                                         remove_acronyms=False)
            sent = [word for word in sent if word != '']
            if len(sent) > 0:
                whole_text_unnested.append(sent)
                doc.append(sent)
    whole_text_nested.append(doc)

print("Saving the Cleaned Sentences as lists...")

print("Saving List 1: Flattened list")
quickpickle_dump(
    whole_text_unnested,
    "../../../models_storage/word_embeddings_data/cleaned_text_flat_2020_oct17_1990.pkl"
)
print("Pickle file 1 saved!")

print("Saving List 2: Nested list")
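# Hedged sketch (not in the original script) of the word2vec step that `whole_text_unnested` feeds
# into once the lists are saved: the hyperparameters and the output path below are illustrative
# assumptions, and the parameter names follow gensim 4.x (`vector_size` was `size` in gensim 3.x).
from gensim.models import Word2Vec

w2v_model = Word2Vec(sentences=whole_text_unnested,  # flat list of tokenized sentences
                     vector_size=300,                 # embedding dimensionality (assumed)
                     window=5,                        # context window in words (assumed)
                     min_count=5,                     # ignore words rarer than this (assumed)
                     workers=4,                       # parallel training threads
                     sg=1)                            # 1 = skip-gram, 0 = CBOW (assumed)
w2v_model.save("../../../models_storage/word_embeddings_data/example_word2vec.model")  # hypothetical path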
def preprocess_text(article, shorten=False, longest=999999, shortest=0, maxlen=999999, minlen=0):
    '''
    Cleans up articles by removing page marker junk, unicode formatting, and extra whitespaces;
    re-joining words split by (hyphenated at) end of line;
    removing numbers (by default) and acronyms (not by default);
    tokenizing sentences into words using the Apache Lucene Tokenizer (same as JSTOR);
    lower-casing words;
    removing stopwords (same as JSTOR), junk formatting words, junk sentence fragments,
    and proper nouns (the last not by default).

    Args:
        article (str): lots of sentences with punctuation etc., often long
        shorten (bool): if True, shorten articles to at most maxlen words
        longest (int): number of words in longest article in corpus (computed elsewhere)
        shortest (int): number of words in shortest article in corpus (depends on filtering)
        maxlen (int): maximum number of words to return per article; defaults to a huge number,
            set lower if shorten == True
        minlen (int): minimum number of words to return per article

    Returns:
        list of list of str: each element of the outer list is a sentence,
            and each sentence is a list of words
    '''

    # Remove page marker junk, leftover XML/HTML tags, and LaTeX-style markup
    article = article.replace('<plain_text><page sequence="1">', '')
    article = re.sub(r'</page>(\<.*?\>)', ' \n ', article)
    article = re.sub(r'<.*?>', '', article)
    article = re.sub(r'\\\w+(\[.*?\])*(\{.*?\})*', '', article)

    # Compute maximum length for this article: from minlen to maxlen, gradated depending on longest
    if shorten:
        article_length = len(article.split())  # tokenize (split by spaces), then count words in article
        if article_length > minlen:
            # article is longer than minimum length to extract, so decide how much to extract
            maxlen = get_maxlen(article_length, longest, shortest, maxlen, minlen)
        else:
            # article isn't longer than minimum length to extract, so just take the whole thing
            shorten = False

    doc = []      # list to hold tokenized sentences making up article
    numwords = 0  # initialize word counter

    if shorten:
        for sent in article.split('\n'):
            if numwords >= maxlen:
                break  # stop adding words once maxlen is reached
            sent = [word for word in
                    clean_sentence_apache(sent,
                                          unhyphenate=True,
                                          remove_numbers=True,
                                          remove_acronyms=False,
                                          remove_stopwords=False,
                                          remove_propernouns=False,
                                          return_string=False)
                    if word != '']  # remove empty strings
            if len(sent) > 0:
                gap = int(maxlen - numwords)
                if len(sent) > gap:
                    # sentence is bigger than gap between current numwords and max words, so shorten it
                    sent = sent[:gap]
                doc.append(sent)
                numwords += len(sent)

    else:  # take the whole article (don't shorten)
        for sent in article.split('\n'):
            sent = [word for word in
                    clean_sentence_apache(sent,
                                          unhyphenate=True,
                                          remove_numbers=True,
                                          remove_acronyms=False,
                                          remove_stopwords=False,
                                          remove_propernouns=False,
                                          return_string=False)
                    if word != '']  # remove empty strings
            if len(sent) > 0:
                doc.append(sent)

    return doc
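# `get_maxlen` is not shown in this excerpt. Below is a minimal sketch of one plausible
# implementation, matching the comment above about a per-article budget "gradated" between minlen
# and maxlen according to where the article falls between the shortest and longest articles in the
# corpus; the formula actually used in the pipeline may differ.
def get_maxlen(article_length, longest, shortest, maxlen, minlen):
    """Linearly interpolate a per-article word budget between minlen and maxlen (sketch)."""
    if longest <= shortest:  # degenerate corpus stats: fall back to the hard cap
        return maxlen
    position = (article_length - shortest) / (longest - shortest)  # 0.0 for shortest, 1.0 for longest
    position = min(max(position, 0.0), 1.0)                        # clamp to [0, 1]
    return int(minlen + position * (maxlen - minlen))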
articles = pd.read_csv("../../../models_storage/word_embeddings_data/filtered_index.csv",
                       names=colnames, header=None)

files_to_be_opened = ["../../../jstor_data/ocr/" + file + '.txt' for file in articles.file_name]
all_files = ['../../../jstor_data/ocr/' + f for f in listdir(ocr_wd) if isfile(join(ocr_wd, f))]
files = [file for file in all_files if file in files_to_be_opened]

# Initialize two lists for strings from files and the filenames
text_ls = []
filename_ls = []

index = 1
for file in files:
    with open(file, 'r') as myfile:
        data = myfile.read()
    data = data.replace('<plain_text><page sequence="1">', '')
    data = re.sub(r'</page>(\<.*?\>)', ' \n ', data)
    data = clean_sentence_apache(data,
                                 unhyphenate=True,
                                 remove_propernouns=False,
                                 remove_acronyms=False,
                                 return_string=True)
    text_ls.append(data)
    filename_ls.append(file.replace('../ocr/', ''))  # paths start with '../../../jstor_data/ocr/', so the full path is kept here
    if index % 1000 == 0:
        print("Cleaned", index, "documents.")
    index += 1

print("Text cleaning completed!")

d = {'filename': filename_ls, 'text': text_ls}
df = pd.DataFrame(d)

print("Shortening texts...")
df["edited_filename"] = df['filename'].apply(lambda x: x[40:-4])  # drop leading path/prefix characters and the '.txt' extension
df.text = df.text.apply(lambda x: x[:10000] if len(x) > 10000 else x)  # cut each text down to 10,000 characters max
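# Illustration (not in the original script) of the two .apply() calls above, using a made-up
# filename in the JSTOR-style 'journal-article-...' format; the real shared prefix may differ,
# but 40 characters would cover '../../../jstor_data/ocr/journal-article-' exactly.
example_path = '../../../jstor_data/ocr/journal-article-10.2307_1234567.txt'  # hypothetical file
assert len('../../../jstor_data/ocr/journal-article-') == 40
assert example_path[40:-4] == '10.2307_1234567'  # edited_filename keeps just the article ID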