# Prep dictionaries of English words
from nltk.corpus import words  # Dictionary of 236K English words from NLTK

english_nltk = set(words.words())  # Convert to a set for fast membership checks
english_long = set()  # Dictionary of 467K English words from https://github.com/dwyl/english-words
fname = "../../../models_storage/word_embeddings_data/english_words.txt"  # File path to the long English dictionary
with open(fname, "r") as f:
    for word in f:
        english_long.add(word.strip())

# Create useful lists using the functions above:
stop_words_list = stopwords_make()
punctstr = punctstr_make()
unicode_list = unicode_make()
print("Stopwords, Unicodes, Punctuations lists creation complete!")

# df = df.reset_index(drop=True)

# The commented-out code below is the old version that used neither phrase detection nor tqdm for timing:
# docs_tagged = []
# s_count = 0  # initialize a counter to track the number of schools processed
# for i in range(len(df)):
#     school = df['text'][i]
#     doc = []
#     s_count += 1
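# Note: stopwords_make(), punctstr_make(), and unicode_make() are helper functions
# defined elsewhere in this repo. As a rough, illustrative approximation of what
# they are assumed to return (the actual implementations may filter or extend
# these lists differently), something similar can be built from the standard
# library and NLTK; the approx_* names below are hypothetical and used only for
# illustration:
import re
import string

from nltk.corpus import stopwords

approx_stop_words = set(stopwords.words("english"))          # assumed core of stop_words_list
approx_punctstr = "[" + re.escape(string.punctuation) + "]"  # assumed regex character class like punctstr
approx_unicode_list = ["\u2018", "\u2019", "\u201c", "\u201d", "\u2013", "\xa0"]  # assumed sample of unicode_list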
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

d = {'filename': filename_ls, 'text': text_ls}
df = pd.DataFrame(d)

# Prep dictionaries of English words
from nltk.corpus import words  # Dictionary of 236K English words from NLTK

english_nltk = set(words.words())  # Convert to a set for fast membership checks
english_long = set()  # Dictionary of 467K English words from https://github.com/dwyl/english-words
fname = "../../../models_storage/word_embeddings_data/english_words.txt"  # File path to the long English dictionary
with open(fname, "r") as f:
    for word in f:
        english_long.add(word.strip())

# Create lists of stopwords, punctuation, and unicode characters
stop_words_list = stopwords_make()  # Define an old vocab file path here if you want to remove first, dirty elements
unicode_list = unicode_make()
punctstr = punctstr_make()
print("Stopwords, Unicodes, Punctuations lists creation complete!")

# word2vec computation: clean and sentence-tokenize every school's text
whole_text_unnested = []
whole_text_nested = []
tqdm.pandas(desc="Cleaning text")
for school in tqdm(df['text'], desc="Cleaning text"):
    doc = []
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence_apache(sent,