def store_sentence_lengths(dataset):
    """Count tokens per sentence across a dataset and pickle the counts."""
    path = '../data/' + dataset
    docs, labels = read_files(path)
    lengths = []
    for doc in docs:
        sentences = preprocess.tokenize_sentences(doc)
        for s in sentences:
            tokens = preprocess.tokenize_tokens(s)
            lengths.append(len(tokens))
    print(lengths)
    pickle_to_file(lengths, 'output/sentence-lengths/' + dataset)
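# A minimal usage sketch for store_sentence_lengths. The dataset name below is
# an assumption: it mirrors the TASA path used by plot_sentence_lengths
# further down, and any directory of readable documents under ../data/ works.
def _demo_store_sentence_lengths():
    store_sentence_lengths('tasa/TASA900_text')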
def _cooccurrence_preprocessing(doc, context, already_preprocessed):
    """Preprocess a document as needed for co-occurrence network creation."""
    if context == 'window':
        if already_preprocessed:
            doc = doc.split(' ')
        else:
            doc = preprocess.preprocess_text(doc)
    elif context == 'sentence':
        doc = preprocess.tokenize_sentences(doc)
        for i, sentence in enumerate(doc):
            doc[i] = preprocess.preprocess_text(sentence)
    return doc
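# A hedged sketch of the two context modes above. The exact return shapes
# depend on preprocess.preprocess_text / tokenize_sentences, but the intent
# is: 'window' yields one flat token sequence for sliding-window
# co-occurrence, while 'sentence' yields one token sequence per sentence so
# that co-occurrence never crosses sentence boundaries.
def _demo_cooccurrence_preprocessing():
    doc = "Graphs can represent text. Nodes stand for terms."
    windowed = _cooccurrence_preprocessing(doc, 'window', already_preprocessed=False)
    per_sentence = _cooccurrence_preprocessing(doc, 'sentence', already_preprocessed=False)
    return windowed, per_sentence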
def plot_sentence_lengths(datafile=None):
    """Plot a histogram of sentence lengths within a given dataset.

    If `datafile` is None, the TASA corpus is read and the lengths are
    pickled; otherwise the lengths are loaded from the given pickle.
    """
    if datafile is None:
        import preprocess
        print('> reading data..')
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print('> counting lengths..')
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
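# Usage sketch: the first call tokenizes the corpus and writes the pickle;
# afterwards the histogram can be re-plotted from that pickle without
# re-reading the corpus (the path is the one plot_sentence_lengths itself
# writes above).
def _demo_plot_sentence_lengths():
    plot_sentence_lengths()
    plot_sentence_lengths('output/tasa_sentence_lengths.pkl')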
import ssl

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def summarize(article):
    """Rank sentences with TextRank, then blank out a named entity in each
    top-ranked sentence to produce fill-in-the-blank questions.

    Assumes the local preprocess module and a spaCy pipeline bound to the
    module-level name `nlp`.
    """
    # Work around SSL certificate errors when NLTK fetches resources.
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    # nltk.download('averaged_perceptron_tagger')

    # Split into sentences, keep letters only, lowercase, drop stopwords.
    sentences = preprocess.tokenize_sentences(article)
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [preprocess.remove_stopwords(r.split()) for r in clean_sentences]

    # Load 100-dimensional GloVe vectors into a word -> vector map.
    word_embeddings = {}
    with open('/Users/apple/Downloads/glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

    # Represent each sentence as the (smoothed) average of its word vectors.
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,)))
                     for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Pairwise cosine similarity between sentence vectors.
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100))[0, 0]

    # PageRank over the similarity graph ranks the sentences (TextRank).
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Blank the first named entity in each ranked sentence to form a question.
    ques = []
    for i in range(len(ranked_sentences)):
        sentence = ranked_sentences[i][1]
        doc = nlp(sentence)
        for X in doc.ents:
            if X.label_:
                ques.append(sentence.replace(X.text, "__________"))
                break
    return ques
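# A hedged usage sketch for summarize(). It assumes the hard-coded GloVe file
# exists locally and that `nlp` has been loaded at module level, e.g.
# nlp = spacy.load('en_core_web_sm'), since summarize() references it as a
# global. The sample article is illustrative only.
def _demo_summarize():
    article = ("The Eiffel Tower was completed in 1889. "
               "It remains one of the most visited monuments in Paris.")
    for question in summarize(article):
        print(question)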
import argparse