from pdf_extractor import extract
import spacy


def load_spacy_model():
    # Parse the extracted PDF text with the large English model and
    # persist the pipeline to disk so it can be reloaded later.
    text, tokens, keywords = extract('uploads/test4.pdf')
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(text)
    nlp.to_disk('spacy_model')
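# A minimal usage sketch, assuming load_spacy_model() has been run once:
# a pipeline saved with to_disk() can be reloaded by passing the same
# path to spacy.load().
if __name__ == '__main__':
    load_spacy_model()
    reloaded = spacy.load('spacy_model')
    print(reloaded.pipe_names)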
from gensim.models import word2vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS
from stopwords import stop_word_list
from pdf_extractor import extract
import spacy

stop_words = stop_word_list()
text, tokens, keywords = extract('uploads/test4.pdf')

# Filter stop words. Calling tokens.remove() while iterating over tokens
# skips elements, so build a new list instead.
tokens = [word for word in tokens if word not in stop_words]
cleantext = " ".join(tokens)

nlp = spacy.load('en_core_web_sm')  # make sure to use larger model!
doc = nlp(cleantext)

# Group token texts by sentence, the nested-list input word2vec expects.
list_of_lists = []
for sentence in doc.sents:
    inner_list = []
    for token in sentence:
        inner_list.append(token.text)
    list_of_lists.append(inner_list)
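# The so-far-unused imports above (word2vec, TSNE, matplotlib) suggest the
# next step: train embeddings on list_of_lists and plot a 2-D projection.
# A minimal sketch, assuming gensim 4.x; vector_size, window, and
# min_count are placeholder values, not from the original.
import numpy as np

model = word2vec.Word2Vec(list_of_lists, vector_size=100, window=5, min_count=2)

words = model.wv.index_to_key
vectors = np.asarray([model.wv[w] for w in words])

# t-SNE requires perplexity to be smaller than the number of samples.
coords = TSNE(n_components=2, perplexity=min(30, len(words) - 1),
              random_state=0).fit_transform(vectors)

plt.figure(figsize=(10, 10))
plt.scatter(coords[:, 0], coords[:, 1], s=5)
for (x, y), w in zip(coords, words):
    plt.annotate(w, (x, y), fontsize=8)
plt.show()

# The WordCloud import likewise points at a frequency visualization over
# the cleaned text; a sketch with assumed parameters:
wc = WordCloud(stopwords=STOPWORDS, background_color='white').generate(cleantext)
plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()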