import string

from nltk.corpus import stopwords


def getStopWordList():
    # Build the stopword list: NLTK English stopwords, punctuation,
    # and a few Twitter-specific placeholder tokens.
    punctuation = list(string.punctuation)
    stop = stopwords.words('english') + punctuation + [
        'AT_USER', 'URL', 'url', 'retweet', 'rt'
    ]
    return stop

def get(self):
    # App Engine handler: seed the StopWord entities on first run,
    # then enqueue ten /scrape_bot tasks in steps of 100.
    if not db.GqlQuery("SELECT * FROM StopWord").get():
        import stopwords as s
        db.put([StopWord(word=w) for w in s.words()])
    for x in range(10):
        taskqueue.add(url="/scrape_bot", params={'start': (x * 100)})

def Add(self, doc):
    # Convert the Python-dict-style input string into valid JSON
    # (quote keys and values, drop single quotes), then parse it.
    formatted_json = (doc.replace("{", "{\"").replace(":", "\":\"")
                         .replace("}", "\"}").replace(",", "\",\"")
                         .replace("'", ""))
    json_doc = json.loads(formatted_json)

    # Write the document to the CSV file, adding a header if the file is new/empty.
    with open(DocumentPath, 'a+'):
        pass  # make sure the file exists before reading it
    with open(DocumentPath, 'r') as csvfile:
        is_empty = len([row for row in csv.DictReader(csvfile)]) == 0
    with open(DocumentPath, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if is_empty:
            writer.writerow(["id", "text"])  # header for the CSV file
        writer.writerow([json_doc['id'], json_doc['text']])

    # Tokenise the text, lowercase the tokens and remove English stopwords.
    nltk.download('stopwords', quiet=True)
    stop_words = stopwords.words('english')
    norm_tokens = [t.lower() for t in json_doc['text'].split()
                   if t.lower() not in stop_words]

    # Add/update the inverted index with the new tokens.
    for token in norm_tokens:
        if token in self.invertedindex:
            self.invertedindex[token] = self.invertedindex[token] + "/" + json_doc['id']
        else:
            self.invertedindex[token] = json_doc['id']

    # Save the index as a simple "token,doc-ids" CSV file.
    with open(InvertedIndexPath, 'w') as f:
        for key in self.invertedindex:
            f.write("%s,%s\n" % (key, self.invertedindex[key]))

    print("Added " + formatted_json)
    return self.dFrameDocuments, self.documents

from nltk.corpus import stopwords


def stopw(words, lang='french'):
    # Filter stopwords: French uses a custom list (custom_fr_stopwords is
    # defined elsewhere), other languages fall back to NLTK.
    if lang == 'french':  # '==' rather than 'is' for string comparison
        stop_words = custom_fr_stopwords()
        words = [w for w in words if w not in stop_words]
    else:
        words = [w for w in words if w not in stopwords.words(lang)]
    return words

from collections import Counter


def rank(data, stopword=False):
    """Count the word frequencies of a sentence.

    Takes a ``list`` of words and returns a ``dict``-like Counter of [word, count].
    """
    if stopword:
        # 'stopwords' must supply a Thai list here (e.g. pythainlp.corpus.stopwords).
        data = [word for word in data if word not in stopwords.words('thai')]
    return Counter(data)

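# Minimal usage sketch for rank() above; the token list is illustrative only,
# and the stopword=True call assumes a stopwords corpus that provides a Thai list.
if __name__ == '__main__':
    sample = ['ผม', 'รัก', 'คุณ', 'รัก']
    print(rank(sample))                 # roughly Counter({'รัก': 2, 'ผม': 1, 'คุณ': 1})
    print(rank(sample, stopword=True))  # same counts with Thai stopwords removed
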
from nltk.corpus import stopwords


def cleanText(text):
    # removePunctuation() and tokenizationWord() are helpers defined elsewhere.
    stopWords = stopwords.words("english")
    words_no_punc = tokenizationWord(removePunctuation(text))
    clean_words = [w.lower() for w in words_no_punc if w not in stopWords]
    return ' '.join(clean_words)

def find_keyword(word_list, lentext=3):
    """Keyword finder.

    Approach: remove stopwords, then count the remaining words.
    find_keyword(word_list, lentext=3)
    word_list -- the list of words
    lentext   -- minimum number of occurrences for a word to count as a keyword (default 3)
    """
    thai_stopwords = set(stopwords.words('thai'))  # build the set once, not per word
    filtered_words = [word for word in word_list if word not in thai_stopwords]
    ranked = rank(filtered_words)
    return {k: v for k, v in ranked.items() if v >= lentext}

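# Minimal usage sketch for find_keyword() above; the token list is illustrative only
# and assumes the same Thai stopwords corpus used by rank().
if __name__ == '__main__':
    tokens = ['ไป', 'กิน', 'ข้าว', 'กิน', 'ข้าว', 'กิน']
    print(find_keyword(tokens, lentext=2))  # e.g. {'กิน': 3, 'ข้าว': 2}, stopwords dropped
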
import urllib.request

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Fetch the page and strip the HTML down to plain text.
response = urllib.request.urlopen('https://en.wikipedia.org/wiki/SpaceX')
html = response.read()
soup = BeautifulSoup(html, 'html5lib')
text = soup.get_text(strip=True)

# Whitespace-tokenise and drop English stopwords.
tokens = text.split()
sr = set(stopwords.words('english'))
clean_tokens = [token for token in tokens if token not in sr]

# Frequency distribution of the remaining tokens.
freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))
freq.plot(20, cumulative=False)

import collections
import json
import os
import re
import sys

from nltk.corpus import stopwords


def reverseIndex():
    global t_c, d_c, termID, docID, term_doc_invertedID, BUFFER_SIZE

    f_c = len(os.listdir(d_p))
    p_c = 0  # number of files parsed so far
    stop = set(stopwords.words('english'))

    # Pass 1: read bookkeeping.json to map each data ID ("folder/file") to its URL.
    for dir_folder in os.listdir(d_p):
        if dir_folder.startswith('bookkeeping.json'):
            with open(d_p + str(dir_folder)) as json_file:
                bookkeeping = json.load(json_file)
            for k, v in bookkeeping.items():
                print("Data ID: " + k + " URL: " + v)
                docID[k] = v

    # Pass 2: walk every data folder, tokenise each file and build the inverted index.
    for dir_folder in os.listdir(d_p):
        try:
            # Skip hidden entries and the bookkeeping files; everything else is a data folder.
            if not (dir_folder.startswith('.')
                    or dir_folder.startswith('bookkeeping.tsv')
                    or dir_folder.startswith('bookkeeping.json')):
                print(dir_folder)
                for index in os.listdir(d_p + dir_folder):
                    with open(d_p + str(dir_folder) + '/' + index, 'r', errors='ignore') as raw_data_file:
                        di_id = dir_folder + '/' + index  # doc ID in "folder/file" form, matches bookkeeping
                        for line in raw_data_file.readlines():
                            line = re.sub('[^a-zA-Z0-9]', ' ', line)
                            for word in [k.lower() for k in line.split()]:
                                if (word not in stop) and (not word.isdigit()):
                                    value = termID.get(word)
                                    if value is None:  # unseen term: assign the next term ID
                                        termID[word] = t_c
                                        term_doc_invertedID[t_c] = {di_id}
                                        t_c = t_c + 1
                                    else:
                                        term_doc_invertedID[value].add(di_id)
                    # Finished parsing one file; report overall progress (assumes ~500 files per folder).
                    p_c = p_c + 1
                    progress = (p_c / float(f_c * 500)) * 100
                    sys.stdout.write("Parsing files... %d%% \r" % progress)
                    if progress != 100:
                        sys.stdout.flush()
                    else:
                        sys.stdout.write('\n')
        except ValueError:
            print('No Valid json ' + dir_folder)

    # Write the term IDs, doc IDs and inverted index to disk.
    with open('termID.txt', 'w') as t_id, open('docID.txt', 'w') as d_id, \
            open('invertedIndex.txt', 'w') as i_id:
        termID = collections.OrderedDict(sorted(termID.items(), key=lambda x: x[1]))
        docID = collections.OrderedDict(sorted(docID.items(), key=lambda x: x[1]))
        m_t = len(termID)
        m_d = len(docID)

        term_count = 0
        for k, v in termID.items():
            t_id.write(str(v) + ' ' + k + '\n')
            term_count = term_count + 1
            progress = (term_count / float(m_t)) * 100
            sys.stdout.write("Writing to file... %d%% \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

        doc_count = 0
        for k, v in docID.items():
            d_id.write(str(k) + ' ' + v + '\n')
            doc_count = doc_count + 1
            progress = (doc_count / float(m_d)) * 100
            if progress % 10 == 0:
                sys.stdout.write("Writing to file... %d%% \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

        t_d_count = 0
        for k, v in term_doc_invertedID.items():
            # Line format: termID doc-count doc-id doc-id ...
            list_doc = str(len(v))
            for document_id in set(v):
                list_doc += ' ' + str(document_id)
            i_id.write(str(k) + ' ' + list_doc + '\r\n')
            t_d_count = t_d_count + 1
            progress = (t_d_count / float(m_t)) * 100
            sys.stdout.write("Writing inverted index.... %d%% \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

    print('Total terms: ' + str(m_t))
    print('Total docs: ' + str(m_d))
    print('...Completed Reverse Indexing')

import re

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords

# Preprocess the data ('paragraph' is the raw input text, defined elsewhere).
txt = re.sub(r'\[[0-9]*\]', ' ', paragraph)  # drop citation markers like [12]
txt = re.sub(r'\s+', ' ', txt)
txt = txt.lower()
txt = re.sub(r'\d', ' ', txt)
txt = re.sub(r'\s+', ' ', txt)

# Prepare the dataset: sentence- and word-tokenise, then remove stopwords.
cleans = nltk.sent_tokenize(txt)
cleans = [nltk.word_tokenize(clean) for clean in cleans]
for i in range(len(cleans)):
    cleans[i] = [word for word in cleans[i]
                 if word not in stopwords.words('english')]

# Train the Word2Vec model.
model = Word2Vec(cleans, min_count=1)
words = model.wv.vocab

# Look up a word vector and the most similar words.
vector = model.wv['war']
similar = model.wv.most_similar('vikram')
print(similar)

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def tokenize_text(cleaned_text) -> list:
    """Tokenize tweet text and filter out stop words."""
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(cleaned_text)
    # stopwords.words() with no language argument returns NLTK's stopwords for every language.
    return [i for i in tokens if i not in stopwords.words()]

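# Minimal usage sketch for tokenize_text() above; the sample tweet is illustrative only.
if __name__ == '__main__':
    print(tokenize_text("Just watched the launch with @elonmusk - it was amazing!"))
    # -> roughly ['Just', 'watched', 'launch', '@elonmusk', '-', 'amazing', '!']
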