Example #1
import string

from nltk.corpus import stopwords


def getStopWordList():
    # build a stopword list from NLTK's English list, punctuation and a few tweet-specific tokens
    punctuation = list(string.punctuation)
    stop = stopwords.words('english') + punctuation + [
        'AT_USER', 'URL', 'url', 'retweet', 'rt'
    ]
    return stop
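A minimal usage sketch (the sample tokens are hypothetical); it shows the list being used to filter a tweet's tokens:

stop = getStopWordList()
tokens = ['AT_USER', 'loves', 'the', 'new', 'rocket', '!', 'URL']  # hypothetical tweet tokens
print([t for t in tokens if t not in stop])  # ['loves', 'new', 'rocket']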
Example #2
File: main.py Project: rcbm/read-hn
    def get(self):
        # Seed the StopWord table once, then enqueue ten scraper tasks in batches of 100
        if not db.GqlQuery("SELECT * FROM StopWord").get():
            import stopwords as s
            db.put([StopWord(word=w) for w in s.words()])

        for x in range(10):
            taskqueue.add(url="/scrape_bot", params={'start': (x * 100)})
Example #3
    def Add(self, doc):
        # Convert the loosely formatted input string into valid JSON
        formatted_json = (doc.replace("{", "{\"").replace(":", "\":\"")
                             .replace("}", "\"}").replace(",", "\",\"")
                             .replace("'", ""))

        # Parse the JSON string into a Python dict
        json_doc = dict(json.loads(formatted_json))

        # Create the csv document file if it does not exist already
        open(DocumentPath, 'a+').close()

        # A header row is only needed when the file is still empty
        with open(DocumentPath, 'r') as csvfile:
            needs_header = len([row for row in csv.DictReader(csvfile)]) == 0

        # Append the document to the csv file
        with open(DocumentPath, 'a+', newline='') as csvfile:
            f = csv.writer(csvfile)
            if needs_header:
                f.writerow(["id", "text"])  # header for the csv file
            f.writerow([json_doc['id'], json_doc['text']])

        # Create tokens, normalise them and remove stopwords
        nltk.download('stopwords', quiet=True)
        stop_words = stopwords.words('english')

        tokens = json_doc['text'].split()  # split the string into a list

        norm_tokens = []
        for temp in tokens:
            temp = temp.lower()
            if temp not in stop_words:
                norm_tokens.append(temp)

        # Add/update the inverted index with the new tokens
        for token in norm_tokens:
            if token in self.invertedindex:
                self.invertedindex[token] = (self.invertedindex[token] + "/" +
                                             json_doc['id'])
            else:
                self.invertedindex[token] = json_doc['id']

        # Save the index to a csv file
        with open(InvertedIndexPath, 'w') as f:
            for key in self.invertedindex:
                f.write("%s,%s\n" % (key, self.invertedindex[key]))

        print("Added " + formatted_json)

        return self.dFrameDocuments, self.documents
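For reference, a quick sketch of what the string massaging at the top of Add produces; the sample input is hypothetical and assumes the text contains no commas or colons of its own:

import json

doc = "{id:42,text:hello indexing world}"  # hypothetical loosely formatted input
formatted_json = (doc.replace("{", "{\"").replace(":", "\":\"")
                     .replace("}", "\"}").replace(",", "\",\"").replace("'", ""))
print(formatted_json)              # {"id":"42","text":"hello indexing world"}
print(json.loads(formatted_json))  # {'id': '42', 'text': 'hello indexing world'}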
Example #4
def stopw(words, lang='french'):
    # compare strings with '==' ('is' tests object identity and is unreliable here)
    if lang == 'french':
        stop_words = custom_fr_stopwords()
        words = [w for w in words if w not in stop_words]
    else:
        words = [w for w in words if w not in stopwords.words(lang)]

    return words
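A small usage sketch that exercises the NLTK branch only, so the project-specific custom_fr_stopwords() is not needed (the input words are hypothetical):

from nltk.corpus import stopwords

print(stopw(['this', 'is', 'a', 'quick', 'test'], lang='english'))  # ['quick', 'test']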
Example #5
def rank(data, stopword=False):
    """Rank the word counts of a sentence.
    Takes a ``list`` of words and returns a ``dict``-like Counter of [word, count]."""
    if not stopword:
        rankdata = Counter(data)
    else:
        data = [word for word in data if word not in stopwords.words('thai')]
        rankdata = Counter(data)
    return rankdata
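A minimal usage sketch with stopword left at False, so no Thai stopword corpus is required (the input tokens are hypothetical):

from collections import Counter  # rank() assumes Counter is already imported

print(rank(['แมว', 'กิน', 'ปลา', 'กิน']))  # Counter({'กิน': 2, 'แมว': 1, 'ปลา': 1})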
Example #6
def cleanText(text):
    stopWords = stopwords.words("english")
    clean_words = []
    words_no_punc = tokenizationWord(removePunctuation(text))
    for w in words_no_punc:
        # lowercase first so capitalised stopwords such as "The" are filtered out too
        w = w.lower()
        if w not in stopWords:
            clean_words.append(w)
    return ' '.join(clean_words)
Example #7
def find_keyword(word_list, lentext=3):
    '''
    Simple keyword finder.
    Approach: remove the stopwords, then count the remaining words and return the frequent ones.

    find_keyword(word_list, lentext=3)
    word_list is the list of words to search
    lentext is the minimum number of occurrences a word needs to count as a keyword; the default is 3
    '''
    filtered_words = [word for word in word_list if word not in set(stopwords.words('thai'))]
    word_list = rank(filtered_words)
    return {k: v for k, v in word_list.items() if v >= lentext}
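A usage sketch under the assumption that the stopwords object here is a Thai-capable corpus wrapper (as the 'thai' argument suggests) rather than NLTK's English-centric one; the word list is hypothetical:

words = ['แมว', 'กิน', 'ปลา', 'แมว', 'แมว', 'กิน']
print(find_keyword(words, lentext=2))  # e.g. {'แมว': 3, 'กิน': 2} if none of the words are stopwords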
Example #8
import urllib.request
import nltk

response = urllib.request.urlopen('https://en.wikipedia.org/wiki/SpaceX')
html = response.read()
# print(html)

from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html5lib')
text = soup.get_text(strip=True)
# print(text)

tokens = [t for t in text.split()]
# print(tokens)

from nltk.corpus import stopwords

# build the stopword set once (requires a prior nltk.download('stopwords'))
sr = set(stopwords.words('english'))
clean_tokens = tokens[:]
for token in tokens:
    if token in sr:
        clean_tokens.remove(token)

freq = nltk.FreqDist(clean_tokens)
for key, val in freq.items():
    print(str(key) + ':' + str(val))
freq.plot(20, cumulative=False)
Example #9
def reverseIndex():
    global t_c
    global d_c
    global termID
    global docID
    global term_doc_invertedID
    global BUFFER_SIZE
    f_c = len(os.listdir(d_p))
    p_c = 0  # number of files parsed so far
    stop = set(stopwords.words('english'))
    
    #print(f_c)

    ##
    # Trying our best to access the files 
    ##

    #get the matching data id (directory folder) and file name
    for dir_folder in os.listdir(d_p):
        if dir_folder.startswith('bookkeeping.json'):
            with open(d_p + str(dir_folder)) as json_file:
                json_data = json.load(json_file)
                for k, v in json_data.items():
                    print("Data ID: " + k + " URL: " + v)
                    docid = k
                    docUrl = v
                    docID[docid] = docUrl

    #Continue to access each file, tokenise all the words and build the reverse index
    for dir_folder in os.listdir(d_p):  # get every folder
        try:
            # look for the data within the text files inside the folder directory;
            # skip hidden entries and the bookkeeping files
            if not (dir_folder.startswith('.') or dir_folder.startswith('bookkeeping.tsv') or dir_folder.startswith('bookkeeping.json')):
                print(dir_folder)
                for index in os.listdir(d_p + dir_folder):
                    # open each individual file inside the folder
                    with open(d_p + str(dir_folder) + '/' + index, "r", errors="ignore") as raw_data_file:
                        # ID in the json file (equivalent of #/#), used to look up the url
                        di_id = dir_folder + '/' + index

                        for line in raw_data_file.readlines():
                            line = re.sub('[^a-zA-Z0-9]', ' ', line)
                            for word in [k.lower() for k in line.split()]:
                                if (word not in stop) and (not word.isdigit()):
                                    value = termID.get(word)
                                    if not value:
                                        termID[word] = t_c
                                        term_doc_invertedID[t_c] = {di_id}  # set of doc ids for this term
                                        t_c = t_c + 1
                                    else:
                                        term_doc_invertedID[value].add(di_id)
                    # count as finished parsing a file
                    p_c = p_c + 1
                # show overall progress (for debugging)
                progress = (p_c / float(f_c * 500)) * 100
                sys.stdout.write("Parsing files... %d%%  \r" % progress)
                if progress != 100:
                    sys.stdout.flush()
                else:
                    sys.stdout.write('\n')

        except ValueError:
            print('No Valid json ' + dir_folder)
    #output term id, doc id, and inverted index
    with open('termID.txt', 'w') as t_id, open('docID.txt', 'w') as d_id, open('invertedIndex.txt', 'w') as i_id:
        termID = collections.OrderedDict(sorted(termID.items(), key=lambda x: x[1]))
        docID = collections.OrderedDict(sorted(docID.items(), key=lambda x: x[1]))
        m_t = len(termID)
        m_d = len(docID)
        term_count = 0
        doc_count = 0
        t_d_count = 0

        for k, v in termID.items():
            t_id.write(str(v) + ' ' + k + '\n')
            term_count = term_count + 1
            progress = (term_count / float(m_t)) * 100
            sys.stdout.write("Writing to file... %d%%  \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

        for k, v in docID.items():
            d_id.write(str(k) + ' ' + v + '\n')
            doc_count = doc_count + 1
            progress = (doc_count / float(m_d)) * 100
            if progress % 10 == 0:
                sys.stdout.write("Writing to file... %d%% \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

        for k, v in term_doc_invertedID.items():
            list_doc = str(len(v))
            for document_id in set(v):
                list_doc += ' ' + str(document_id)
            i_id.write(str(k) + ' ' + list_doc + '\r\n')
            t_d_count = t_d_count + 1
            progress = (t_d_count / float(m_t)) * 100
            sys.stdout.write("Writing inverted index.... %d%% \r" % progress)
            if progress != 100:
                sys.stdout.flush()
            else:
                sys.stdout.write('\n')

    print('Total terms: ' + str(m_t))
    print('Total docs: ' + str(m_d))
    print('...Completed Reverse Indexing')
Example #10
import re

import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec

# nltk.download('punkt') and nltk.download('stopwords') are needed once before running this
# Preprocessing the data ('paragraph' is assumed to hold the raw input text)
txt = re.sub(r'\[[0-9]*\]', ' ', paragraph)  # strip reference markers like [12]
txt = re.sub(r'\s+', ' ', txt)
txt = txt.lower()
txt = re.sub(r'\d', ' ', txt)
txt = re.sub(r'\s+', ' ', txt)

# Preparing the dataset
cleans = nltk.sent_tokenize(txt)

cleans = [nltk.word_tokenize(clean) for clean in cleans]

for i in range(len(cleans)):
    cleans[i] = [
        word for word in cleans[i] if word not in stopwords.words('english')
    ]

# Training the Word2Vec model
model = Word2Vec(cleans, min_count=1)

words = model.wv.vocab

# Finding Word Vectors
vector = model.wv['war']

# Most similar words
similar = model.wv.most_similar('vikram')
print(similar)
#print (vector)
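Note that in gensim 4 and newer the vocab attribute was removed; a minimal adjustment, assuming the rest of the script is unchanged:

words = model.wv.key_to_index               # replaces model.wv.vocab in gensim >= 4
vector = model.wv['war']                    # unchanged
similar = model.wv.most_similar('vikram')   # unchanged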
Example #11
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


def tokenize_text(cleaned_text) -> list:
    """Tokenize tweet text and filter out stop words."""
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(cleaned_text)
    return [i for i in tokens if i not in stopwords.words()]
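A quick usage sketch (the tweet text is made up; the exact output depends on NLTK's combined stopword lists, since stopwords.words() with no argument merges every language):

print(tokenize_text("Loving the new release of this library!"))
# roughly ['Loving', 'new', 'release', 'library', '!']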