def import_data(a_file, a_row, b_file, b_row):
    a_content = []
    with open(a_file, 'r') as a_handle:
        csv_reader_a = csv.reader(a_handle)
        for row in csv_reader_a:
            row_new = remove_stopwords(row[a_row])
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            a_content.append(row_new)
    a_length = len(a_content)
    a_label = np.ones(a_length)
    a_label = a_label.tolist()

    b_content = []
    with open(b_file, 'r') as b_handle:
        csv_reader_b = csv.reader(b_handle)
        for row in csv_reader_b:
            row_new = remove_stopwords(row[b_row])  # was row[a_row]: use the column index for file B
            row_new = strip_numeric(row_new)
            row_new = strip_non_alphanum(row_new)
            row_new = strip_short(row_new, minsize=3)
            b_content.append(row_new)
    b_length = len(b_content)
    b_label = np.zeros(b_length)
    b_label = b_label.tolist()

    return a_content, a_label, b_content, b_label

def import_data(file):
    human = []
    machine = []
    content = open(file, 'r')
    csv_reader = csv.reader(content)
    for row in csv_reader:
        # NOTE: unicode() is Python 2 only; under Python 3 the csv rows are already str.
        row1 = unicode(row[2], errors='ignore')
        row_new1 = remove_stopwords(row1)
        row_new1 = strip_numeric(row_new1)
        # row_new = strip_non_alphanum(row_new)
        row_new1 = strip_short(row_new1, minsize=3)
        human.append(row_new1)

        row2 = unicode(row[3], errors='ignore')
        row_new2 = remove_stopwords(row2)
        row_new2 = strip_numeric(row_new2)
        # row_new = strip_non_alphanum(row_new)
        row_new2 = strip_short(row_new2, minsize=3)
        machine.append(row_new2)
    length = len(human)
    human_label = np.ones(length)
    human_label = human_label.tolist()
    machine_label = np.zeros(length)
    machine_label = machine_label.tolist()
    return human, human_label, machine, machine_label

def clean_compute_similarity(d1, d2):
    d1 = remove_stopwords(d1).split()
    d2 = remove_stopwords(d2).split()

    # Dictionary and corpus
    documents = [d1, d2]
    dictionary = corpora.Dictionary(documents)

    # Composing the term similarity matrix from the fastText vectors
    similarity_matrix = fasttext_model300.similarity_matrix(
        dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100)

    # Conversion of sentences into bag-of-words vectors: doc2bow() counts the occurrences of
    # each distinct word, maps the word to its integer id and returns the result as a sparse vector.
    d1 = dictionary.doc2bow(d1)
    d2 = dictionary.doc2bow(d2)

    # Soft cosine similarity: considers similarities between pairs of features
    score = softcossim(d1, d2, similarity_matrix)
    return score

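# Hedged usage sketch for clean_compute_similarity: assumes gensim 3.x, where
# KeyedVectors.similarity_matrix and gensim.matutils.softcossim are still available,
# and that fasttext_model300 is a module-level KeyedVectors loaded via gensim's downloader.
# The two sentences are illustrative only.
import gensim.downloader as api
from gensim import corpora
from gensim.matutils import softcossim
from gensim.parsing.preprocessing import remove_stopwords

fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')  # large download on first use

score = clean_compute_similarity("The cat sat quietly on the mat",
                                 "A small dog was lying on the rug")
print("soft cosine similarity:", round(score, 3))
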
def preprocessing(text):
    '''Preprocesses a text using standard gensim techniques:
    removes stopwords, strips short words (1-2 characters), strips numbers,
    strips http addresses, strips Unicode from emoji etc., lowercases everything,
    strips extra spaces, punctuation, non-alphanumeric symbols. Also performs stemming.

    input:
        text: a string
    returns:
        the preprocessed string.
    '''
    text = text.lower()
    text = preprocess.remove_stopwords(text)  # remove stop words
    text = preprocess.strip_short(text)       # get rid of short words
    text = preprocess.strip_numeric(text)     # get rid of numbers
    p = re.compile(r'(http.*\s)|(http.*$)')
    text = p.sub('', text)
    p = re.compile(r'[^\x00-\x7F]+')
    text = p.sub('', text)
    text = preprocess.strip_multiple_whitespaces(text)
    text = preprocess.strip_punctuation(text)
    text = preprocess.strip_non_alphanum(text)
    text = preprocess.remove_stopwords(text)
    text = preprocess.strip_short(text)
    # stemming
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    text = ' '.join(stemmed_words)
    return text

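# Minimal, hedged usage sketch for preprocessing(): it assumes the module-level names the
# function relies on, namely `preprocess` as an alias for gensim.parsing.preprocessing and
# an NLTK PorterStemmer bound to `stemmer`.
import re
from gensim.parsing import preprocessing as preprocess
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

print(preprocessing("Visit http://example.com for 20 FREE samples of our running shoes!!"))
# -> roughly "visit free sampl run shoe" (exact output depends on gensim's stopword list)
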
def getTopics(question, answer):
    table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
    new_q = question.translate(table)
    new_a = answer.translate(table)
    questionClean = remove_stopwords(new_q).lower().split()
    answerClean = remove_stopwords(new_a).lower().split()
    allWords = questionClean + answerClean
    maxScore = 0
    maxTopic = ""
    maxWord = ""
    maxTopicScores = {topic: (0, "") for topic in allTopics}
    for topic in allTopics:
        for word in allWords:
            try:
                curScore = wv.similarity(word, topic)
            except KeyError:
                curScore = 0
            if curScore > maxScore:
                maxScore = curScore
                maxTopic = topic
                maxWord = word
            if maxTopicScores[topic][0] < curScore:
                maxTopicScores[topic] = (curScore, word)
    print(maxTopicScores)
    if maxScore > 0.15:
        print("maxscore", maxScore, maxWord)
        return [maxTopic]
    print("maxscore", maxScore, maxWord)
    return []

def match(entity='', subject=''):
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    entity = [lemmatizer.lemmatize(e)
              for e in tokenizer.tokenize(remove_stopwords(entity).lower())
              if all([not e.isnumeric(), not e[0].isnumeric(), len(e) > 2])]
    subject = [lemmatizer.lemmatize(e)
               for e in tokenizer.tokenize(remove_stopwords(subject).lower())
               if all([not e.isnumeric(), not e[0].isnumeric(), len(e) > 2])]
    # True if any lemmatized entity token also appears among the subject tokens
    return not all([e not in subject for e in entity])

def readCorpus(fname, tokens_only=False, mode='w'):
    tokens = []
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if mode == 's':
                tokens.append(split_sentences(remove_stopwords(line)))
            else:
                # Train text with or without tags
                tokens.append(gensim.utils.simple_preprocess(remove_stopwords(line)))
    return tokens

def test_strip_stopwords(self):
    self.assertEqual(remove_stopwords("the world is square"), "world square")

    # confirm that redefining the global `STOPWORDS` works
    with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])):
        self.assertEqual(remove_stopwords("the world is square"), "world is square")

def cosine_distance(a, b):
    a = remove_stopwords(a)
    b = remove_stopwords(b)
    a_avg = None
    b_avg = None
    for w in a.split(" "):
        # accumulate word vectors; compare against None explicitly, since truth-testing
        # a numpy array raises a ValueError
        a_avg = model[w] if a_avg is None else a_avg + model[w]
    for w in b.split(" "):
        b_avg = model[w] if b_avg is None else b_avg + model[w]
    a_avg /= len(a.split(" "))
    b_avg /= len(b.split(" "))
    return 1 - spatial.distance.cosine(a_avg, b_avg)

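# Hedged usage sketch for cosine_distance: assumes the module-level `model` is a gensim
# KeyedVectors instance (any pretrained word vectors will do) and that `spatial` comes
# from scipy. Despite its name, the function returns a similarity (1 - cosine distance).
import gensim.downloader as api
from scipy import spatial
from gensim.parsing.preprocessing import remove_stopwords

model = api.load('glove-wiki-gigaword-50')  # small pretrained vectors, for the sketch only

print(cosine_distance("the king rules the country", "a queen governs the nation"))
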
def string_processor(token):
    # text = str(token)
    text = unidecode(token)  # renamed from `str` to avoid shadowing the builtin
    text = remove_stopwords(text)
    text = strip_punctuation(text)
    text = remove_stopwords(text)
    # text = strip_non_alphanum(text)  # would remove all punctuation
    tokens = sp(text)
    tokens = [token.lemma_ for token in tokens]  # lemma_ replaces all 'I' with '-PRON-' (spaCy quirk)
    # tokens = [porter_stemmer.stem(token) for token in tokens]
    text = " ".join(tokens)
    text = strip_multiple_whitespaces(text)
    text = text.strip(' ')
    return text

def cohesiveness_between_chapters(self, document):
    '''
    Compute cohesiveness between chapters using latent semantic analysis.
    :param document: document to be processed, a list of chapters.
    :return: cohesiveness matrix
    '''
    document = [' '.join(chapter) for chapter in document]
    document = [remove_stopwords(chapter).split() for chapter in document]
    dictionary = corpora.Dictionary(document)
    corpus = [dictionary.doc2bow(chapter) for chapter in document]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
    corpus_lsi = lsi[corpus_tfidf]
    index = similarities.MatrixSimilarity(corpus_lsi)
    sims = index[corpus_lsi]
    # index = similarities.MatrixSimilarity(corpus_tfidf)  # similarity with tf-idf instead
    # sims = index[corpus_tfidf]
    return sims

def scrub_stopwords(txt: str, lib_sw: str = None) -> str:
    """
    Removes stopwords from text using a choice of libraries.

    Reference:
    - https://medium.com/towards-artificial-intelligence/stop-the-stopwords-using-different-python-libraries-ffa6df941653  # noqa: E501

    :param txt: String to pass in to remove stopwords.
    :param lib_sw: String of the library to use to remove stopwords.
    :return: String that has had its stopwords removed.
    """
    if lib_sw is None:
        return txt  # nothing to do: return the text unchanged rather than falling through to None
    elif lib_sw == 'sklearn':
        txt = [word for word in txt.split() if word not in ENGLISH_STOP_WORDS]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'nltk':
        txt = [word for word in txt.split() if word not in STOPWORDS_NLTK]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'spacy':
        txt = [word for word in txt.split() if word not in STOPWORDS_SPACY]
        txt = ' '.join(txt)
        return txt
    elif lib_sw == 'gensim':
        txt = remove_stopwords(txt)
        return txt
    else:
        raise Exception(
            f"Sorry, entered library, {lib_sw}, is not recognised.\n" +
            "Please enter one from [None, 'sklearn', 'nltk', 'spacy', 'gensim']"
        )

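# Hedged usage sketch for scrub_stopwords, exercising only the gensim branch so that none of
# the other stopword sets (ENGLISH_STOP_WORDS, STOPWORDS_NLTK, STOPWORDS_SPACY) need to exist.
from gensim.parsing.preprocessing import remove_stopwords

print(scrub_stopwords("this is a sentence about the weather", lib_sw='gensim'))
# -> "sentence weather"
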
def clean_sentence(sentence, stopwords=False):
    sentence = sentence.lower().strip()
    sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
    if stopwords:
        sentence = remove_stopwords(sentence)
    return sentence

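# Quick, hedged illustration of clean_sentence (assumes `re` and gensim's remove_stopwords
# are imported at module level).
print(clean_sentence("Hello, World! This is a TEST."))                  # "hello world this is a test"
print(clean_sentence("Hello, World! This is a TEST.", stopwords=True))  # "hello world test"
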
def send(self, s):
    tweet = json.loads(str(s))
    if 'user' not in tweet:
        return
    if not tweet['lang'] == 'en':
        return
    txt = re.sub(r"[^\w\s@#]+", '', tweet['text']).lower()
    txt = str(remove_stopwords(' '.join(sorted(txt.split()))))
    self.meth(txt)

def process_review_raw_data(self):
    print("Review data pre-processing start...")
    _reviews = []
    with open(config.path2datasets + self.dataset_name, 'r') as f:
        for line in f.readlines():
            review_json = json.loads(line)
            _business_id = review_json['business_id']
            _review_id = review_json['review_id']
            _stars = review_json['stars']
            _text = review_json['text']
            # remove punctuation and stopwords, then lowercase
            _text = strip_punctuation(_text)
            _text = remove_stopwords(_text)
            _text = _text.lower()
            _reviews.append({
                'review_id': _review_id,
                'business_id': _business_id,
                'stars': _stars,
                'text': _text
            })
    _reviews = pd.DataFrame(_reviews)
    _reviews.to_csv(config.path2data + self.dataset_name + "." + config.path2reviews)
    _reviews = None
    print("Review data pre-processing DONE")

def wordcloud_auto(df):
    """
    Takes a df, and turns it into a word cloud, if possible.
    The data frame must have a column named 'text' in order for this function to run properly.
    """
    if 'text' in df.columns:
        df['gs_remove'] = df.text.apply(lambda x: remove_stopwords(x))
        df['nlp'] = df.gs_remove.apply(lambda x: nlp(x))
        lemma = []
        for i in iter(df.nlp):
            for j in i:
                lemma.append(j.lemma_)
        STOPWORDS.add('PRON')
        stopwords = STOPWORDS
        wordcloud = WordCloud(stopwords=stopwords,
                              background_color='White',
                              width=1000,
                              height=500,
                              max_words=30).generate(' '.join(lemma))
        plt.figure(figsize=(24, 16))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.show()
    else:
        print(
            "Cannot locate the text column. Please use pd.dataframe.rename() to specify the column."
        )

def _proc_sent(self, sent, rm_dialog, rm_stop, stem, rm_short=None, min_nw_sent=3):
    sent = sent.lower()
    sent = re.sub(r'\s+', ' ', sent).strip()  # remove extra spaces
    if not sent:
        return None

    if rm_short and len(nltk.tokenize.word_tokenize(sent)) < min_nw_sent:
        return None

    if rm_dialog:
        dialog_tokens = ["''", "``"]
        for tk in dialog_tokens:
            if tk in sent:
                logger.info('Remove dialog')
                return None
        if config.test_year == '2005' and sent[0] == "'" and ('says' in sent or 'said' in sent):
            logger.info('Remove dialog')
            return None

    if rm_stop:
        sent = remove_stopwords(sent)

    if stem:
        sent = self.porter_stemmer.stem_sentence(sent)

    return sent

def index():
    index_id = request.json
    if not index_id:
        abort(400)

    posted_fields = index_id.keys()
    required_fields = {'id'}
    if not required_fields <= posted_fields:
        abort(400, f'Missing fields: {required_fields - posted_fields}')

    idx = index_id.get('id')
    timelines_req = requests.get('http://localhost:5100/timelines/id/' + idx)
    data = timelines_req.json()['data'][0]['text']

    lower = data.lower()
    for c in string.punctuation:
        lower = lower.replace(c, "")
    remove_sw = remove_stopwords(lower)
    tokens = remove_sw.split()
    for t in tokens:
        r.sadd(t, idx)

    response.status = 200
    return timelines_req.json()

def data_clean(path_to_data, path_to_label):
    '''
    Inputs:
        path_to_data: path to data.txt
        path_to_label: path to label.txt
    Outputs:
        A pandas dataframe with the preprocessed data with the respective category labels
    '''
    data = []
    df = pd.read_excel(path_to_label)
    with open(path_to_data) as file:
        for line in file:
            line = regex(line)
            # to remove stopwords
            line = remove_stopwords(line)
            data.append(line.strip().lower())
            if data[-1] == "------------------------------------------------" \
                           "------------------------------------------------------":
                del data[-1]

    string = ""
    privacy_preprocessed = []
    for item in data[1:]:
        if item != data[0]:
            string += item
        else:
            privacy_preprocessed.append(string)
            string = ""
    privacy_preprocessed.append(string)

    df['Privacy_Policies'] = privacy_preprocessed
    df["len"] = df["Privacy_Policies"].apply(lambda x: len(x))
    df.drop(df[df["len"] == 0].index, inplace=True)
    return df

def clean_documents(documents):
    documents_clean = []
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    for d in documents:
        # Lowercase the document
        d = d.lower()
        # Remove urls, emoji, smileys and numbers via the tweet-preprocessor options set above
        document_test = p.clean(d)
        # Remove stop words
        document_test = remove_stopwords(document_test)
        # Remove non-ASCII (Unicode) characters
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', document_test)
        # Remove mentions
        document_test = re.sub(r'@\w+', '', document_test)
        # Remove punctuation
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        # Remove remaining digits
        document_test = re.sub(r'[0-9]', '', document_test)
        documents_clean.append(document_test)
    return documents_clean

def custom_preprocess(sentence):
    # Custom preprocess function for the test documents; can also be applied to a pandas dataframe series
    sentence = sentence.lower()
    no_stopwords = remove_stopwords(sentence)
    tokens = tokenize(no_stopwords)  # note: this tokenization is not used by the steps below
    no_punctuation = strip_punctuation(no_stopwords)
    unwanted = remove_unwanted(no_punctuation)
    return unwanted

def train_word2vec(self):
    if self.train_documents is None:
        self.prepare_train_documents()

    print("\t. Estimating Word2Vec model")
    print("\t. Loading training documents")
    counter = 0
    all_docs = []
    for train_doc in self.train_documents:
        doc = train_doc[:150000] if len(train_doc) > 150000 else train_doc
        if (counter % 100) == 0:
            print("{0} .. len: {1}".format(counter, len(doc)))
        counter += 1
        doc = remove_stopwords(doc)
        # doc = re.sub(r'[^\w\s]', '', doc)
        doc_tokens = nltk.word_tokenize(doc.lower())
        all_docs.append(doc_tokens)

    print("Creating all tagged documents")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_docs)]

    print("\t. Run model")
    model = Doc2Vec(documents=documents, vector_size=700, window=7, min_count=3)
    print()
    print("\t. Done")
    self.word2vec_model = model

def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possible be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    # text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    # text = preprocessing.stem_text(text)
    return text

def getLemmatizedText(name, content, language):
    language = language[:2]
    language = language.lower()
    outText = ""
    if language:
        if language == "is":
            outText = getLemmatizedTextIS(name, content)
            print("IS")
        else:
            outText = lemmatizerMultilanguage.getLemmatizedText(language, name + " " + content)
            print(language.upper())
    else:
        text = name + " " + content
        outText = text.lower().replace('.', '.')
        print("ERROR: No language for Lemmatizing text")

    cleaned = re.sub(' +', ' ', outText)
    cleaned = cleaned.replace('\n', '')
    cleaned = cleaned.replace('\r', '')
    cleaned = remove_stopwords(cleaned)
    cleaned = strip_tags(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_numeric(cleaned)
    cleaned = strip_short(cleaned, 1)
    cleaned = strip_multiple_whitespaces(cleaned)
    cleaned = cleaned.lower()
    print("Lemmatized CLEAN: " + cleaned)
    return cleaned

def __init__(self, df, gram=1, n_most=20):
    self.df = df
    self.ngrams = []
    self.index = []
    self.value = []
    for sentence in self.df.text:
        sentence = remove_stopwords(sentence)
        # filter out empty tokens (removing items while iterating over the same list skips elements)
        splitted = [element for element in sentence.split(' ') if element != '']
        while len(splitted) > (gram - 1):
            self.ngrams.append(tuple(splitted[0:gram]))
            splitted.pop(0)
    self.count = Counter(self.ngrams).most_common(n_most)
    for i in self.count:
        if len(self.count[0][0]) == 2:
            self.index.append('\n'.join([i[0][0], i[0][1]]))
            self.value.append(i[1])
        elif len(self.count[0][0]) == 3:
            self.index.append('\n'.join([i[0][0], i[0][1], i[0][2]]))
            self.value.append(i[1])
        else:
            print('Neither 2 nor 3')
            break

def preprocess(rdd):
    """
    Pre-process tweets in rdd so they'll be suitable for use in the downstream topology
    """
    return rdd.map(
        # xform json into dicts
        lambda js: json.loads(js[1])
    ).filter(
        # analyze only tweets from users (skip "delete" messages, eg)
        lambda tweet: 'user' in tweet
    ).filter(
        # don't analyze our own tweets
        lambda tweet: tweet['user']['id_str'] != me
    ).filter(
        # english only
        lambda tweet: 'lang' in tweet and tweet['lang'] == 'en'
    ).map(
        # pluck out tweet's author & text & downcase tweet text
        lambda tweet: (tweet['user']['screen_name'], tweet['text'].lower())
    ).map(
        # kill punctuation, except for @mentions and #hashtags and spaces
        lambda t: (t[0], re.sub(r"[^\w\s@#]+", '', t[1]))
    ).map(
        # add text w/ stop words removed
        lambda t: (t[0], t[1], remove_stopwords(t[1]))
    ).map(
        # pprint() can only handle ascii, it seems
        lambda t: [_.encode('ascii', 'ignore') for _ in t]
    )

def clean(doc):
    # NOTE: Series.set_value() is deprecated (removed in pandas >= 1.0); doc.iat[i] = ... is the modern equivalent.
    for i in range(doc.shape[0]):
        # lowercasing
        doc.set_value(i, doc.iloc[i].lower())
    for i in range(doc.shape[0]):
        # remove punctuation
        doc.set_value(i, re.sub(r'([^\s\w])+', '', doc.iloc[i]))
    for i in range(doc.shape[0]):
        # remove stopwords
        doc.set_value(i, remove_stopwords(doc.iloc[i]))
    for i in range(doc.shape[0]):
        # tokenize
        doc.set_value(i, word_tokenize(doc.iloc[i]))
    for i in range(doc.shape[0]):
        # lemmatize
        for j in range(len(doc.iloc[i])):
            doc.iloc[i][j] = lemmatizer.lemmatize(doc.iloc[i][j])

def preprocess_tweet(tweet):
    """
    This function will preprocess the input tweet.

    Steps for preprocessing:
    1. Lowercase the letters
    2. Replace the characters with frequency greater than 3 with 3 in a word
    3. Replace a url with Tag: <URLURL>
    4. Replace a tag mention: <UsernameMention>

    @TODO:
    1. Look for better preprocessing methods on the web
    2. Apply here
    """
    clean_tweet = tp.clean(tweet)

    # perform lemmatization
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)
    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet

def write_discharge_summaries(out_file):
    notes_file = '%s/NOTEEVENTS.csv' % (MIMIC_3_DIR)
    print("processing notes file")
    with open(notes_file, 'r') as csvfile:
        with open(out_file, 'w') as outfile:
            print("writing to %s" % (out_file))
            outfile.write(','.join(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT']) + '\n')
            notereader = csv.reader(csvfile)
            # header
            next(notereader)
            i = 0
            for line in tqdm(notereader):
                subj = int(line[1])
                category = line[6]
                if category == "Discharge summary":
                    note = line[10]
                    # tokenize, lowercase and remove numerics
                    # tokens = [t.lower() for t in tokenizer.tokenize(note) if not t.isnumeric()]
                    # text = '"' + ' '.join(tokens) + '"'
                    text = remove_stopwords(udf_clean(note))
                    outfile.write(','.join([line[1], line[2], line[4], text]) + '\n')
                i += 1
    return out_file

def toSentences(pageList, language='English', keywords=None):
    # convert into long string (from list of page texts)
    longString = ''.join(pageList).replace('\n', ' ')
    # Remove stop words
    sentences_nostops = remove_stopwords(longString)
    # split into list of sentences
    sentences = nltk.sent_tokenize(sentences_nostops)
    if keywords:
        sentences = extractKeywordSentences(sentences, keywords)
    # Convert sentences to list of words
    data_words = list(sent_to_words(sentences))
    # Form Bigrams
    # sentences_bigrams = make_bigrams(data_words)
    sentences_bigrams = data_words
    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])
    sentences_lemmatized = lemmatization(
        nlp, sentences_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    return sentences_lemmatized

def load_top_topics(docs=[], cnt_topics=5):
    docs = [remove_stopwords(doc) for doc in docs]  # remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')  # => https://www.kite.com/python/docs/nltk.RegexpTokenizer
    for i in range(len(docs)):
        docs[i] = docs[i].lower()              # lower strings
        docs[i] = tokenizer.tokenize(docs[i])  # split strings into tokens
    # exclude numbers and too-short tokens
    docs = [[token for token in doc if not token.isnumeric() and not token[0].isnumeric()] for doc in docs]
    docs = [[token for token in doc if len(token) > 1] for doc in docs]
    lemmatizer = WordNetLemmatizer()  # => https://www.nltk.org/_modules/nltk/stem/wordnet.html
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]  # group similar words
    dictionary = Dictionary(docs)  # create dictionary
    corpus = [dictionary.doc2bow(doc) for doc in docs]  # create corpus
    model = LdaModel(  # => https://radimrehurek.com/gensim/models/ldamodel.html
        corpus=corpus,
        id2word=dictionary,
        # chunksize=2000,
        # alpha='auto',
        # eta='auto',
        iterations=200,
        # passes=20,
        # eval_every=None,
        num_topics=cnt_topics
    )
    top_topics = model.top_topics(corpus)  # [([(a, x), ..., (a, x)], a), ...]
    return top_topics

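# Hedged usage sketch for load_top_topics: assumes the imports the function above relies on
# and a toy corpus, so the resulting topics are only illustrative.
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer  # nltk.download('wordnet') may be needed once

sample_docs = [
    "The striker scored twice and the team won the match",
    "The election results were announced by the government",
    "The goalkeeper saved a penalty in the final minutes",
]
for topic, coherence in load_top_topics(sample_docs, cnt_topics=2):
    print([word for _, word in topic[:5]], coherence)
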
def freq_for_all():
    conn = sqlite3.connect('stonks.db')
    c = conn.cursor()
    c.execute("select text from posts where text <> '[removed]'")
    total = c.fetchall()
    for i in total:
        for x in i:
            filtered = remove_stopwords(x)
            split = filtered.split()
            for z in split:
                if z in my_stop_words:
                    pass
                else:
                    word_list.append(z)
    conn.commit()
    conn.close()

    for word in word_list:
        d[word] = d.get(word, 0) + 1

    word_freq = []
    for key, value in d.items():
        word_freq.append((value, key))
    word_freq.sort()
    print(word_freq)  # was print(word), which only showed the leaked loop variable

def save_word_dict(text):
    proc_text = []
    sentences = text
    sentences = tokenize.sent_tokenize(sentences)
    for sentence in sentences:
        sentence_without_stops = remove_stopwords(sentence)
        sentence_without_stops = stem_text(sentence_without_stops)
        sentence_without_stops = strip_short(sentence_without_stops)
        sentence_without_stops = strip_punctuation(sentence_without_stops)
        proc_sentence = word_tokenize(sentence_without_stops.lower())
        if len(proc_sentence) == 0:
            continue
        proc_text.append(proc_sentence)
    dictionary = corpora.Dictionary(proc_text)
    return [dictionary, proc_text, sentences]

def testStripStopwords(self):
    self.assertEqual(remove_stopwords("the world is square"), "world square")