Example #1

# Prep dictionaries of English words
from nltk.corpus import words # Dictionary of 236K English words from NLTK
english_nltk = set(words.words())  # Convert to a set for fast membership checks
english_long = set()  # Dictionary of 467K English words from https://github.com/dwyl/english-words
fname = "../../../models_storage/word_embeddings_data/english_words.txt"  # Set file path to the long English dictionary
with open(fname, "r") as f:
    for word in f:
        english_long.add(word.strip())
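
# A quick, hypothetical usage sketch (not part of the original pipeline): tokens are
# typically kept only if they appear in one of these English word sets. The
# is_english_word helper below is illustrative only.
def is_english_word(token):
    """Return True if the lowercased token appears in either English dictionary set."""
    token = token.lower()
    return token in english_nltk or token in english_long

print(is_english_word("school"))  # expected True, since "school" is a common dictionary word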
# Create useful lists using above functions:
stop_words_list = stopwords_make()
punctstr = punctstr_make()
unicode_list = unicode_make()


print("Stopwords, Unicodes, Punctuations lists creation complete!")

#df = df.reset_index(drop=True)

# NOTE: The commented-out code below is the earlier version, which did not use phrase detection or tqdm for progress timing.
# docs_tagged = []
# s_count = 0  # Counter for the number of schools processed
# for i in range(len(df)):
#     school = df['text'][i]
#     doc = []
#     s_count += 1
Example #2
d = {'filename': filename_ls, 'text': text_ls}
df = pd.DataFrame(d)
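
# Illustration only: filename_ls and text_ls are built earlier in the notebook
# (not shown in this excerpt). A plausible sketch of how such lists could be
# assembled from a folder of plain-text files; example_dir is a hypothetical
# placeholder, not the project's actual data path.
import os

example_dir = "../../../models_storage/word_embeddings_data/school_texts/"  # hypothetical path
if os.path.isdir(example_dir):
    example_filenames = sorted(os.listdir(example_dir))
    example_texts = []
    for fn in example_filenames:
        with open(os.path.join(example_dir, fn), "r", errors="ignore") as f:
            example_texts.append(f.read())
    example_df = pd.DataFrame({'filename': example_filenames, 'text': example_texts})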

# Prep dictionaries of English words
from nltk.corpus import words  # Dictionary of 236K English words from NLTK
english_nltk = set(words.words())  # Convert to a set for fast membership checks
english_long = set()  # Dictionary of 467K English words from https://github.com/dwyl/english-words
fname = "../../../models_storage/word_embeddings_data/english_words.txt"  # Set file path to the long English dictionary
with open(fname, "r") as f:
    for word in f:
        english_long.add(word.strip())

# ## Create lists of stopwords, punctuation, and unicode characters
stop_words_list = stopwords_make()  # Optionally pass an old vocab file path to remove dirty elements first
unicode_list = unicode_make()
punctstr = punctstr_make()

print("Stopwords, Unicodes, Punctuations lists creation complete!")

# Word2Vec computation: first clean and sentence-tokenize the raw text
whole_text_unnested = []  # accumulates cleaned sentences in one flat list
whole_text_nested = []  # accumulates cleaned sentences nested (grouped) by school
tqdm.pandas(desc="Cleaning text")  # Register tqdm with pandas (enables .progress_apply)

for school in tqdm(df['text'], desc="Cleaning text"):
    doc = []
    for chunk in school.split("\n"):
        for sent in sent_tokenize(chunk):
            sent = clean_sentence_apache(sent,