def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed
    tokens that occur in that string (in order). This is my quick and dirty
    Tokenizer. Satisfaction Not Guaranteed'''
    import re
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)            # normalize curly apostrophes
    sText = re.sub("&.{0,6};", " ", sText)     # drop HTML entities such as &amp;
    sText = re.sub("[\x80-\xff]", "", sText)   # drop non-ASCII bytes
    sText = sText.split(None)
    # split on every punctuation character except the apostrophe
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)  # mapAndFold is an external helper
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)  # drop tokens that start with digits
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)         # drop @mentions
    stemmer = PorterStemmer()
    if pairing:  # return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
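# Hedged usage sketch (illustrative, not part of the original source): how the
# tokenize() helper above is typically called. It assumes the module-level
# `mapAndFold` helper and the stemmer module are available exactly as the
# function expects.
if __name__ == "__main__":
    sample = "Running cats can't hide; 42 @user tokens remain!"
    print(tokenize(sample))                 # stemmed tokens, in order
    print(tokenize(sample, pairing=True))   # (original word, stemmed word) tuples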
def __init__(self, fileA, fileB):
    self.__allWords = set()
    self.__wordsA = dict()
    self.__wordsB = dict()
    with open(fileA, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsA.keys():
                    self.__wordsA[word] += 1
                else:
                    self.__wordsA[word] = 1
    with open(fileB, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsB.keys():
                    self.__wordsB[word] += 1
                else:
                    self.__wordsB[word] = 1
    self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
    self.__table = {t[1]: t[0] for t in enumerate(self.__allWords)}
def stemizeQuery(query):
    # STEMIZER FOR QUERY
    porter = PorterStemmer()
    newList = []
    for q in query:
        newList.append(porter.stem(q, 0, len(q) - 1))
    return newList
def stemize(dictList):
    # STEMIZER FOR DICTIONARY
    porter = PorterStemmer()
    for tokenDict in dictList:
        # iterate over a snapshot of the keys, since the dict is mutated inside the loop
        for token in list(tokenDict.keys()):
            tokenDict[porter.stem(token, 0, len(token) - 1)] = tokenDict.pop(token)
    return dictList
def ReadInfoFile(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    unitDict = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        for word in words:
            unitDict[word] = True
    #print unitDict.keys()
    return unitDict
def stemWords(listTokens):
    s = PorterStemmer()
    stemmedTerms = []
    for x in listTokens:
        stemmedTerms.append(s.stem(x, 0, len(x) - 1))
    return stemmedTerms
def stemWords(inList):
    outList = []
    ps = PorterStemmer()
    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)
    return outList
def stem_phrase(phrase):
    words = phrase.lower().replace('.', '').replace("'", '').split()
    # ignore stop words
    words = [word for word in words if not word in STOP_WORDS]
    p = PorterStemmer()
    return [p.stem(word, 0, len(word) - 1) for word in words]
def stemWords(tokens):
    """Function that stems the words. """
    # use porter stemmer
    # https://tartarus.org/martin/PorterStemmer/python.txt
    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)
    return tokens
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()
    # fancy stuff to remove .,?!"
    mymatch = re.compile('(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'', word)
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
def splitToken(token, isStem=True):
    toks = token.split('_')
    word = toks[0].lower()
    tag = toks[1]
    if not word.isalnum():
        tag = 'PUNC'
    if isStem:
        # simple post stem
        p = PorterStemmer()
        #word = p.stem1(word,0,len(word)-1)
        word = p.stem(word, 0, len(word) - 1)
    return (word, tag)
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)
            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
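# Hedged sketch (not part of the original source): one way the word -> index map
# returned by GetWordDictionary() above could back a fixed-length bag-of-words
# vector. The email dict layout ({'body': ...}), the SplitText() helper, and the
# (word, 0, len(word) - 1) stemming convention are taken from that function; the
# helper name below is hypothetical.
import numpy as np

def EmailToBagOfWords(email, word_mapping, stemmer):
    vector = np.zeros(len(word_mapping))
    for word in SplitText(email['body']):
        if len(word) > 1:
            word = stemmer.stem(word, 0, len(word) - 1)
        index = word_mapping.get(word)
        if index is not None:
            vector[index] += 1.0
    return vector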
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"
    documents = readFromFile(fileName, "r")

    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()

    # List that maintains the document id number, the number of unique terms in the
    # document, and, for each term in the document, the term and its term frequency.
    docId = 1

    # InvFileHash
    invFileHash = {}

    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)

    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")

    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()
    vectorizer = CountVectorizer(input="filename",
                                 ngram_range=ngrams,
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t)-1) if stemming else t) for t in d.split()]),
                                 binary=binary,
                                 min_df=4, max_df=1.0)
    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])

    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
            else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))

        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])

        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])

        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])

        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])
        accuracies.append(100 * correct_predictions / len(results))

    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")

    print("stemmed" if stemming else "unstemmed",
          "presence" if binary else "frequency",
          "unigrams" if ngrams == (1, 1) else
          ("bigrams" if ngrams == (2, 2) else
           ("uni + bi" if ngrams == (1, 2) else "unknown")),
          "accuracy:", sum(accuracies)/len(accuracies))
def stemizeList(normalList):
    # STEMIZER FOR LIST
    porter = PorterStemmer()
    newList = []
    newDict = {}
    count = 0
    for lists in normalList:
        tokenList = []
        for token in lists:
            #print normalList.index(lists)," ",lists.index(token)
            tokenList.append(porter.stem(token, 0, len(token) - 1))
            if token in newDict:
                count = newDict[token]
                newDict[token] = count + 1
            else:
                newDict[token] = 1
        newList.append(tokenList)
        #token = porter.stem(token,0,len(token)-1)
    return newList, newDict
def GetPddlObj(_sWord):
    global text2pddl
    if text2pddl == None:
        ReadMinecraftDict(minecraftDictFile)
    setObjs = set()
    p = PorterStemmer()
    sLastWord = p.stem(_sWord.lower(), 0, len(_sWord) - 1)
    if sLastWord == 'block':
        return setObjs
    #print sLastWord
    for sTextName in text2pddl.keys():
        if text2pddl[sTextName] == 'NULL':
            continue
        lstWords = sTextName.split(' ')
        sLastTextWord = lstWords[len(lstWords) - 1]
        if sLastTextWord == 'block':
            if len(lstWords) == 1:
                continue
            sLastTextWord = lstWords[len(lstWords) - 2]
        if sLastTextWord == sLastWord:
            setObjs.add(text2pddl[sTextName])
    return list(setObjs)
def get_postlist(stop_answer, stem_answer, dict_terms):
    if stop_answer == 'no':
        stopwords = []
    if stop_answer == 'yes':
        stopwords = [
            'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
            'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that',
            'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who',
            'will', 'with', 'the'
        ]
    ps = PorterStemmer()
    position_list = []
    dict_posting = {}
    counter = 0
    for key, value in dict_terms.items():
        if isinstance(value, dict):
            for k, v in value.items():
                if k == 'abstract':
                    val = v.replace(',', '').lower().split()
                    for index, word in enumerate(val):
                        if stem_answer == 'no':
                            stem_word = word
                        if stem_answer == 'yes':
                            stem_word = ps.stem(word, 0, len(word) - 1)
                        if stem_word not in stopwords:
                            if stem_word not in dict_posting:
                                dict_posting[stem_word] = {}
                            if key not in dict_posting[stem_word]:
                                dict_posting[stem_word][key] = {}
                                dict_posting[stem_word][key]['frequency'] = 0
                                dict_posting[stem_word][key]['position'] = []
                            dict_posting[stem_word][key]['frequency'] += 1
                            dict_posting[stem_word][key]['position'].append(index)
    with open('posting_list.json', 'w') as outfile:
        json.dump(dict_posting, outfile)
    print("Finished writing the posting list")
    return dict_posting
def ReadMinecraftDict(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    text2pddl = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        text2pddl[textName] = pddlName
class Cleaner(object):
    def __init__(self, stopwords):
        self.stopwords = stopwords
        self.stemmer = PorterStemmer()

    def clean_word(self, word):
        word = word.strip().lower()
        word = filter(lambda c: c.isalnum(), word)
        if word in self.stopwords:
            word = None
        else:
            word = self.stemmer.stem(word, 0, len(word) - 1)
        return word

    def clean_wordlist(self, wordlist):
        wordlist = " ".join(wordlist).replace('-', ' ').split()
        clean_list = map(lambda x: self.clean_word(x), wordlist)
        return [word for word in clean_list if word]

    @staticmethod
    def make_printable(phrase):
        return filter(lambda c: c in string.printable, phrase)
def GetTFIDF():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()

    # Get email chains
    with open("balanced_chains.pickle", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    np.random.shuffle(emails)

    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff, bucket = emails[i]
        # if int(np.round(time_diff / 60)) > 72:
        #     continue

        # Create stemmed body and append to text_data
        new_str = ""
        words = email['body'].split()
        for word in words:
            new_word = stemmer.stem(word, 0, len(word) - 1)
            new_str += new_word + " "
        new_str = new_str[:-1]
        text_data.append(new_str)

        # Append hour
        y_vals.append(int(np.round(time_diff / 60)))
        #y_vals.append(int(time_diff)

    b = np.array(y_vals)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(text_data)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf, b, count_vect, tf_transformer, text_data
def process(text):
    '''Returns a list of words after carrying out the following text
    preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    # Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
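# Hedged usage sketch (not part of the original source): process() reads its stop
# list from the function attribute `process.stop_words`, so that attribute must be
# assigned before the first call; the commented-out pickle line above hints at the
# intended source. Example with a tiny inline stop set:
if __name__ == "__main__":
    process.stop_words = {'the', 'and', 'for'}
    print(process("Subject: WIN $1000 now at http://example.com !!!"))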
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """ Query to calculate the cosine similarity between document d and Query Q """
    # Loads the inverted File Hash
    dic = loadFromFile()

    # Process the queries read from the query file
    queryList = processQueryList(queryFileRead)

    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)

    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()
    print "Writing to outputdocument.txt file completes."
def buildMatrix(self):
    # use suffix-stripping algorithm to stem word
    porter_strmmer = PorterStemmer()
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        # change document in origin_document array to array of stemmed word
        self.origin_documents[index] = [porter_strmmer.stem(x, 0, len(x) - 1) for x in document.split()]

    # use 2000 most frequent words to generate words array
    temp_word = defaultdict(int)
    for document in self.origin_documents:
        for word in document:
            temp_word[word] += 1
    sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
    sorted_dict.reverse()
    self.words = [x[0] for x in sorted_dict[0:self.word_size]]

    # build document array
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        self.documents.append([])
        self.documents[index] = [document.count(word) for word in self.words]
    # print(self.documents[0], sum(self.documents[0]))

    # remove zero sum rows
    zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    for value in zeros[::-1]:
        del self.labels[value]
        del self.documents[value]
    # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()

    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()

    # List that maintains the document id number, the number of unique terms in the
    # document, and, for each term in the document, the term and its term frequency.
    documentList = []
    docId = 1

    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)

    for i in range(0, len(documentList)):
        print "Document:", documentList[i]['id'], "\nUnique Terms:", \
            documentList[i]['unique'], "\nTerms:\n", documentList[i]['terms']
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()

    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()

    # List that maintains the document id number, the number of unique terms in the
    # document, and, for each term in the document, the term and its term frequency.
    docId = 1

    # InvFileHash
    invFileHash = {}

    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    print "File written: output.json"
    print "Number of terms:", len(invFileHash)
    writeToFile(invFileHash)
class Indexer(): # remove stop words and do stemming STOP_WORD_LIST = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","just","keep","keeps","kept","know","knows","known","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","que","quite","qv","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t's","take","taken","tell","tends","th","than","thank","thanks","t
hanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","value","various","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","zero"] def __init__(self): logging.debug('Indexer => Init params:self') self.idx_fields = [] # field of document to be indexed #self.STOP_WORD_LIST = [] self.P = PorterStemmer() # end of function ''' def set_stop_words(self,stop_word_list): self.STOP_WORD_LIST = stop_word_list # end of function ''' def set_idx_fields(self,fields): logging.debug('Indexer => set_idx_fields fields:' + str(fields)) self.idx_fields = fields def add_idx_field(self,field_name): self.idx_fields.append(field_name) def clean(self,word): #preprocess word word = word.lower() word = word.strip("\n\t,.(){}?!;'") if word not in self.STOP_WORD_LIST: word = self.P.stem(word,0,len(word)-1) else: word = "" return word # end of function def tokenize(self, text): #list word_idx = [] # split lines lines = text.split('\n') for line in lines: # split words words = line.split(' ') for word in words: word = self.clean(word) if len(word) > 1: word_idx.append(word) # make a set (remove duplicate) word_idx = set(word_idx) return word_idx # end of function def index(self, document): if isinstance(document,list): document = document[0] text = "" # get text from document to be indexed for field in self.idx_fields: text += document[field] + " " return self.tokenize(text) def stem(self, words): return [self.tokenize(word) for word in words]
def stemming(self, term):
    output = ""
    stem1 = PorterStemmer()
    output = stem1.stem(term)
    return output
def stemWord(self, word):
    return PorterStemmer.stem(self, word, 0, len(word) - 1)
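# Hedged note (not part of the original source): the unbound-style call above,
# PorterStemmer.stem(self, ...), reads most naturally when the enclosing class
# derives from PorterStemmer. A minimal sketch of that pattern, with a
# hypothetical class name and assuming PorterStemmer is importable:
class StemmingTokenizer(PorterStemmer):
    def stemWord(self, word):
        return PorterStemmer.stem(self, word, 0, len(word) - 1)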
def index_query(self, q, p_doc, s_answer): with open('posting_list.json') as posting_file: p_postlist = json.load(posting_file) with open('dictionary.json') as dict_file: p_dict = json.load(dict_file) self.query = q self.parsed_doc = p_doc self.parsed_postlist = p_postlist self.parsed_dicionary = p_dict self.stem_answer = s_answer index_dict = {} first_item = [] abstract = '' summary = '' abstract_list = [] ps = PorterStemmer() if self.stem_answer == 'no': stem_query = self.query if self.stem_answer == 'yes': stem_query = ps.stem(self.query, 0, len(self.query) - 1) for term, value in self.parsed_postlist.items(): if stem_query == term: if isinstance(value, dict): for doc_id, val in value.items(): if term not in index_dict: index_dict[term] = {} index_dict[term]['documents'] = [] index_dict[term]['summary'] = '' if self.query in self.parsed_doc[doc_id][ 'abstract'] and len(first_item) == 0: first_item.append( self.parsed_doc[doc_id]['abstract']) abstract = first_item[0] abstract_list = abstract.split() summary = abstract_list[abstract_list.index( self.query)] count = 0 while count < 10: if abstract_list.index( self.query) + count < len( abstract_list): index_dict[term][ 'summary'] += abstract_list[ abstract_list.index(self.query) + count] + ' ' else: break count += 1 if doc_id not in index_dict[term]: index_dict[term][doc_id] = {} index_dict[term][doc_id]['position'] = val[ 'position'] index_dict[term][doc_id]['term_frequency'] = val[ 'frequency'] index_dict[term][doc_id][ 'title'] = self.parsed_doc[doc_id]['title'] index_dict[term][ 'doc_frequency'] = self.parsed_dicionary[term] index_dict[term]['documents'].append(doc_id) pprint.pprint(index_dict)
def preprocessing(file_string): try: f = open(file_string) email_contents = f.read() # # #2 去标题 # cut = re.search(r'\n[ \t]*\n',email_contents).span()[1] - 1 # \n...\n # cut ↑ email_contents = email_contents[cut:] # # #3 其他预处理 # #小写 √ email_contents = email_contents.lower() #删HTML标签 √ # email_contents = re.sub(r'<[^<>]+>',' ',email_contents); # URL链接替换 → httpaddr √ # http//https :// xx.xxx.xxx email_contents = re.sub(r'http\:\/\/[\w\.\/]+|https\:\/\/[\w\.\/]+','httpaddr',email_contents) # # URL邮件地址替换 → emailaddr # email_contents = re.sub(r'[\w\-\_]+\@[\w]+\.[\w]+','emailaddr',email_contents) # 数字替换 → number √ # 整数 小数 email_contents = re.sub(r'(\d+|\d+\.\d+)','number',email_contents) # 美元替换 $ → dollar √ # email_contents = re.sub(r'\$','dollar',email_contents) # 单词词干化 # 实现复杂 # 去除非单词和标点 # email_contents = re.sub(r'[\W]',' ',email_contents) # 删去单字母 for i in range(50): email_contents = re.sub(r' [a-z] ',' ',email_contents) # 删去多余空格 email_contents = re.sub(r'[\t\n ]+',' ',email_contents) # # from stemmer import PorterStemmer # stemmer = PorterStemmer() # from stemmer import PorterStemmer stemmer = PorterStemmer() email_contents = stemmer.stem(email_contents) # word_list = re.findall(r'\w+',email_contents) # return word_list for word in word_list: if(word_frequency.get(word,"None") == "None"): word_frequency[word] = 1; else: word_frequency[word] += 1; # return word_frequency except: print (file_string + " ERROR") pass
def stemWords(input):
    stem = PorterStemmer()
    for index, entries in enumerate(input):
        input[index] = stem.stem(entries, 0, len(entries) - 1)
    return input
total_files = 0
p = PorterStemmer()

if not os.path.exists(corpus_dir):
    os.makedirs(corpus_dir)

os.chdir(data_dir)
for file in glob.glob('*.sgm'):
    current_file = os.path.join(data_dir, file)
    print 'Extract files from file %s' % current_file
    soup = BeautifulSoup(open(current_file))
    for document in soup.find_all('reuters'):
        new_file = os.path.join(corpus_dir, document.get('newid'))
        with open(new_file, "wb") as extracted_file:
            read_data = document.get_text().encode('utf-8')
            clean_data = re.sub(r'/', ' / ', read_data)
            clean_data = re.sub(r'-', ' - ', clean_data)
            """ The punctuations contained in the string.punctuation are
            !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ """
            # note: the original translated read_data here, discarding the two
            # substitutions above; translating clean_data preserves them
            clean_data = clean_data.translate(None, string.punctuation)
            clean_data = clean_data.lower()
            output = ''
            for word in clean_data.split():
                output += p.stem(word, 0, len(word) - 1)
                output += ' '
            extracted_file.write(output)
            total_files += 1

print 'Total files extracted %s' % total_files
def run_train_test(training_file, testing_file): # Set the variables, params, dicts, sets alpha = 0.5 stop_words = {'the', 'and'} logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'} Porter_Stemmer = PorterStemmer() # Import training dataset training_start_time = time.time() vocab = set(['positive-words', 'negative-words']) wordcount_class_0 = {'positive-words': 0, 'negative-words': 0} wordcount_class_1 = {'positive-words': 0, 'negative-words': 0} total_reviews = 0 reviewscount_0 = 0 reviewscount_1 = 0 train_labels = [] train_reviews = [] with training_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) total_reviews += 1 # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) vocab.update(words) for word in words: if word not in wordcount_class_0.keys(): wordcount_class_0[word] = 0 wordcount_class_1[word] = 0 if label == 0: reviewscount_0 += 1 for word in words: wordcount_class_0[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 if label == 1: reviewscount_1 += 1 for word in words: wordcount_class_1[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 train_labels.append(label) train_reviews.append(words) # Compute CPTs P_class = [0, 0] P_class[0] = reviewscount_0 / total_reviews P_class[1] = reviewscount_1 / total_reviews P_words_class_0 = {} P_words_class_1 = {} bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab) bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab) for word in vocab: if word in stop_words: P_words_class_0[word] = (0 + alpha) / bottom_0 P_words_class_1[word] = (0 + alpha) / bottom_1 else: P_words_class_0[word] = (wordcount_class_0[word] + alpha) / bottom_0 P_words_class_1[word] = (wordcount_class_1[word] + alpha) / bottom_1 # Inference on the training dataset predict_train_labels = [] for doc in train_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in 
lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 predict_train_labels.append(c) # Compute training accuracy correct = 0 for i in range(len(train_labels)): if predict_train_labels[i] == train_labels[i]: correct += 1 train_accuracy = correct / len(train_labels) training_time = time.time() - training_start_time # Import testing dataset testing_start_time = time.time() test_reviews = [] test_labels = [] with testing_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) test_labels.append(label) test_reviews.append(words) # Inference on the testing dataset predict_test_labels = [] for doc in test_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) bag_of_words = vocab.intersection(bag_of_words) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 # print(c) predict_test_labels.append(c) # Compute testing accuracy correct = 0 for i in range(len(test_labels)): if predict_test_labels[i] == test_labels[i]: correct += 1 test_accuracy = correct / len(test_labels) # Print results testing_time = time.time() - testing_start_time print(round(training_time), 'seconds (training)') print(round(testing_time), 'seconds (labeling)') print(round(train_accuracy, 3), '(training)') print(round(test_accuracy, 3), '(testing)') print(len(vocab)) return
    'file'   : { 'ls-time': 1 },
    'ls'     : { 'ls-time': 1, 'tag-time': 1 },
    'refspec': { 'new-branch-push': 2 }
}

computed = {}
stemmer = PorterStemmer()
for word in sys.argv[1:]:
    word = stemmer.stem(word.lower())
    print word
    if word in synonymMap:
        word = synonymMap[word]
    if word in weightMap:
        for key, value in weightMap[word].iteritems():
            if key in computed:
                computed[key] += value
            else:
                computed[key] = value

sorted_computed = sorted(computed.iteritems(), key=operator.itemgetter(1))
sorted_computed.reverse()
def stem_word(sentence):
    for i in range(len(sentence)):
        p = PorterStemmer()
        #sentence[i] = sentence[i].lower()
        sentence[i] = p.stem(sentence[i], 0, len(sentence[i]) - 1)
    return sentence
def GetFeatures(emails): #np.random.shuffle(emails) word_mapping = GetWordDictionary(emails) # with open("balanced_chains_shuffed.pickle", "wb") as fp2: # pickle.dump(emails, fp2, protocol=pickle.HIGHEST_PROTOCOL) stemmer = PorterStemmer() training_cutoff = int(len(emails) * 0.9) x_vals = [] y_vals = [] count_0 = 0 count_1 = 0 for i in range(0, len(emails)): #print "Evaluation Email %d" % (i) email, next_email, time_diff, bucket = emails[i] #, bucket if (float(np.round(time_diff / 60)) > 24): continue num_features = 9 #15 #13 num_words = 0 #len(word_mapping) # Create feature array features = np.zeros(shape=(num_features + num_words)) #Feature 1: Number of to features[0] = float(len(email['to'])) # Feature 2: Num words words = email['body'].split() lower_case_body = [ stemmer.stem(x.lower(), 0, len(x) - 1) for x in words ] features[1] = float(len(words)) # print email # print bucket # print lower_case_body # print "-------------\n\n" #Feature 3: Number of CC # Old: 0.5442, 0.5387 features[2] = float(email['cc_count']) # Feature 4: is reply # without, .6298, with. .5994 if email['is_re']: features[3] = 1.0 else: features[3] = 0.0 # Feature 5: Time of Day (hour) #print email date = email['date']['local_date'] hour = date.hour # Old, 0.5442, New = 0.5276 features[4] = float(hour) #Feature 6: Length of Subject Line subject_words = email['subject'].split() lower_case_words = [ stemmer.stem(x.lower(), 0, len(x) - 1) for x in subject_words ] features[5] = int(len(subject_words)) # Feature 7: Day of Week # without, 0.5580 features[6] = (date.weekday()) # Feature 8: Question marks in Body features[7] = (email['body'].count('?')) # Feature 9: Question marksin Subject features[8] = (email['subject'].count('?')) # # Feature 10: "RESPONSE NEEDED" in subject # stemmed_response = stemmer.stem("response", 0, len("response") - 1) # if stemmed_response in lower_case_words: # features[9] = 1 # else: # features[9] = 0 # # Feature 11: "Please" in words # stemmed_please = stemmer.stem("please", 0, len("please") - 1) # if stemmed_please in lower_case_words: # features[10] = 1 # else: # features[10] = 0 # if email['body'].find('?') != -1: # features[11] = 1 # else: # features[11] = 0 # if email['subject'].find('?') != -1: # features[12] = 1 # else: # features[12] = 0 # stemmed_can = stemmer.stem("can", 0, len("can") - 1) # if stemmed_can in lower_case_words: # features[13] = 1 # else: # features[13] = 0 # stemmed_important = stemmer.stem("important", 0, len("important") - 1) # if stemmed_important in lower_case_words: # features[14] = 1 # else: # features[14] = 0 # if stemmed_important in lower_case_body: # features[15] = 1 # else: # features[15] = 0 # stemmed_urgent = stemmer.stem("urgent", 0, len("urgent") - 1) # if stemmed_urgent in lower_case_words: # features[16] = 1 # else: # features[16] = 0 # if stemmed_urgent in lower_case_body: # features[17] = 1 # else: # features[17] = 0 # stemmed_need = stemmer.stem("need", 0, len("need") - 1) # if stemmed_need in lower_case_words: # features[18] = 1 # else: # features[18] = 0 # if stemmed_need in lower_case_body: # features[19] = 1 # else: # features[19] = 0 # body = SplitText(email['body']) # for word in body: # modified_word = word # if len(modified_word) > 1: # modified_word = stemmer.stem(word, 0, len(word) - 1).lower() # if modified_word in word_mapping: # features[word_mapping[modified_word]] = 1 x_vals.append(features) y_vals.append(bucket) #y_vals.append(float(np.round(time_diff / 60))) #Append y_value for training point # if float(time_diff) > 60.0: # 
y_vals.append(1) # count_1 += 1 # else: # y_vals.append(0) # count_0 += 1 #y_vals.append(float(np.round(time_diff / 60))) #print count_0 #print count_1 return x_vals, y_vals
class Parse: def __init__(self): self.stop_words = stopwords.words('english') self.dictionary_term_index = {} self.array_names_and_entities = {} self.porter_stemmer = PorterStemmer() def parse_sentence(self, text, stemmer=False): """ This function tokenize, remove stop words and apply lower case for every word within the text :param text: :return: """ list_percent = ["percent", "Percent", "Percentage", "percentage"] self.array_names_and_entities = {} self.dictionary_index = {} text = text.replace("\n", ". ") text = self.ignore_emojis(text) array_text_space = text.split(" ") array_text_space = self.separate_words_with_dots(array_text_space) string_ans = "" array_size = range(len(array_text_space)) string_ans_index = 0 entities_url = [] # help us to replace the url to "" because in get_entities it returns parts of the url for word, idx in zip(array_text_space, array_size): ans = "" if word == '' or word == ' ': continue check_digit = self.isdigit(word) if len(word) < 2 and check_digit is False: continue if len(word) < 2 or self.is_ascii(word) is False: if check_digit is False: word = self.remove_panctuation(word) if self.is_ascii(word) is False or word == '' or word == " " or len( word) < 2 or word.lower() not in self.stop_words: continue if ans == "" and self.is_url(word): entities_url.append(word) if "t.co" in word: continue ans = self.parse_url(word) if ans == "": entities_url.remove(word) continue else: if ans == "" and len(word) < 2 and word[0] != '#' and self.is_ascii(word) and not self.isfloat(word): word = self.remove_panctuation(word) if ans == "" and word[0] == '#': temp_word = self.remove_panctuation(word) if temp_word == "" or temp_word == "#": continue ans = self.parse_hashtag(temp_word) elif ans == "" and word[0] == '@': ans = self.remove_panctuation(word) elif ans == "" and word in list_percent: if idx > 0 and self.isfloat(array_text_space[idx - 1]): ans = self.parse_percentage(array_text_space[idx - 1] + " " + word) string_ans = string_ans[:len(string_ans) - 1 - len(ans)] + string_ans[ len(string_ans) + len(word):] + " " else: ans = word elif ans == "" and (word.lstrip('-').isdigit() or self.isfloat(word.lstrip('-')) or self.isFraction( word.lstrip('-')) or word.replace('~', '').isdigit()): ans = self.convert_str_to_number(array_text_space, idx) if ans == "": pre_ans = self.remove_panctuation(word) if len(pre_ans) < 2: continue array_ans = pre_ans.split() for word_array in array_ans: if word_array.lower() in self.stop_words: continue string_ans += self.add_to_dictionary(word_array.lower(), string_ans_index) string_ans_index += len(word) + 1 else: string_ans += self.add_to_dictionary(ans, string_ans_index) string_ans_index += len(word) + 1 self.get_name_and_entities(entities_url, array_text_space) array_parsed = string_ans.split() return array_parsed, self.array_names_and_entities def separate_words_with_dots(self, array_text): new_text = "" length = range(len(array_text)) for i in length: word = array_text[i] if '.' not in word: new_text += word + " " continue if "http" in word or "www" in word or "t.co" in word or self.isfloat(word): new_text += word + " " continue separate = str(word).split('.') new_text += separate[0] + ". " + separate[1] + " " return new_text.lstrip().split(" ") def is_url(self, text): ''' check if string is a url path :param text: url :return: boolean ''' regex = re.compile( r'^(?:http|ftp)s?://|(?:www)?.' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain... r'localhost|' # localhost... 
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) return re.match(regex, text) is not None def add_to_dictionary(self, text, index): array_of_words = text.split(" ") ans = "" for word in array_of_words: ans += word + " " self.dictionary_index[word] = index if ans == "": return "" return ans def parse_hashtag(self, phrase): """" parser hash tag and lower the letters return array of string #stayAtHome -> ['#stayathome',stay,at,home] """ original_phrase = phrase pattern = re.compile(r"[A-Z][a-z]+|\d+|[A-Z]+(?![a-z])") if phrase[1].islower() and '_' not in original_phrase: phrase = phrase[:1] + phrase[1].upper() + phrase[2:] temp = pattern.findall(phrase) all_words = phrase[1:].split("_") for word in all_words: if word != phrase[1:] and word.lower() and word not in temp: temp.append(word) temp = [str_to_lower.lower() for str_to_lower in temp] temp.insert(0, original_phrase[0:len(original_phrase)].lower().replace('_', '')) i=0 len_temp =len(temp) while i < len_temp: if temp[i] in self.stop_words or len(temp[i]) < 2: temp[i] = '' i += 1 # for word in temp: # if word in self.stop_words or len(word) < 2: # temp.remove(word) return " ".join(temp).lstrip().rstrip() def parse_url(self, string): """ parsing url path return an array of the components """ if string is not None: ans = string.split("/") #r = re.split('[/://?=-]', string) #ans = " ".join(r).lstrip() #for term in ans: # term = re.sub(r"http\S+", "", ans) #ans = "".join(ans).strip().split() ans_len = len(ans) remove_www = "" if ans_len > 0: for term in ans: remove_www += term.replace("www.", "") + " " ans[0] = ans[0].replace(ans[0], remove_www) string_without_stopword = "" length = range(len(ans)) ans_string = ans[0].split(" ") for word, idx in zip(ans_string, length): if word == '' or word == ' ': continue if len(word) < 2 or (len(word) > 0 and word[0] == '#'): continue if word not in self.stop_words or word.isnumeric(): if not self.is_url(word): word = self.remove_panctuation(word) string_without_stopword += word + " " return string_without_stopword.lstrip() else: return "" def isdigit(self, word): if "0" <= word <= "9": return True return False def isfloat(self, value): """ check if value is a float number :return: boolean """ try: float(value) return True except ValueError: return False def isFraction(self, token): """ check if value is a fraction number :return: boolean """ if '/' not in token: return False values = token.split('/') return all(i.isdigit() for i in values) def convert_str_to_number_kmb(self, word): """ check if value is a float number, and return the wanted number. etc: 1000->1K, 1013456->1.013M :return: boolean """ tmb = '' if word >= 1000000000 or word <= -1000000000: word = float(word / 1000000000) tmb = 'B' elif word >= 1000000 or word <= -1000000: word = float(word / 1000000) tmb = 'M' elif word >= 1000 or word <= -1000: word = float(word / 1000) tmb = 'K' ans = '{:0.3f}'.format(word) return '{0:g}'.format(float(ans)) + tmb def convert_str_to_number(self, text_demo, idx): """ check every type of number and return it as a string. 
etc: 1K,1M,1B,-900,23/5,2020,2K :return: boolean """ help_minus = '' text_return = [] my_word = text_demo[idx] text_demo_length = len(text_demo) my_word = my_word.replace(",", "") if re.search('-', my_word): help_minus = '-' my_word = my_word.replace("-", "") if not self.isfloat(my_word): my_word = self.remove_panctuation(my_word) if self.isFraction(my_word): if idx + 1 == text_demo_length: return ''.join(help_minus + my_word) text_return = ''.join(help_minus + my_word) token_next = text_demo[idx + 1].lower() if token_next == "billion" or token_next == "billions": text_return += 'B' text_demo[idx + 1] = "" if token_next == "million" or token_next == "millions": text_return += 'M' text_demo[idx + 1] = "" if text_demo[idx + 1] == "thousand" or token_next == "thousands": text_return += 'K' text_demo[idx + 1] = "" return help_minus + ''.join(text_return) if my_word != '' and not math.isnan(float(my_word)): number = float(my_word) number_numerize = self.convert_str_to_number_kmb(number) if idx + 1 < len(text_demo): token_next = text_demo[idx + 1].lower() number_to_input = str(number_numerize) if token_next == "billion" or token_next == "billions": if 'K' in number_numerize or 'M' in number_numerize: number_to_input = (number_to_input.translate({ord('K'): None})) number_to_input = (number_to_input.translate({ord('M'): None})) text_return.append(my_word) else: text_return.append(str(number_numerize + 'B')) text_demo[idx + 1] = "" elif token_next == "million" or token_next == "millions": if 'K' in number_numerize: number_to_input = (number_to_input.translate({ord('K'): None})) text_return.append(number_to_input + 'B') else: number_to_input = str(number_numerize) text_return.append(number_to_input + 'M') text_demo[idx + 1] = "" elif token_next == "thousand" or token_next == "thousands": if 'K' in number_numerize: number_to_input = (number_to_input.translate({ord('K'): None})) text_return.append(number_to_input + 'M') elif 'M' in number_numerize: number_to_input = (number_to_input.translate({ord('M'): None})) text_return.append(number_to_input + 'B') else: text_return.append(number_to_input + 'K') text_demo[idx + 1] = "" elif 1000 > number > -1000: text_return.append(number_numerize) else: text_return.append(number_numerize) else: text_return.append(number_numerize) if 1900 < number < 2100 and help_minus == '': if '~' in text_demo[idx]: text_return.append(my_word) else: text_return.append(text_demo[idx]) return help_minus + ' '.join(text_return) def ignore_emojis(self, text): emoji_pattern = re.compile("[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" # chinese char u"\U00002702-\U000027B0" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" u"\U00010000-\U0010ffff" u"\u2640-\u2642" u"\u2600-\u2B55" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" # dingbats u"\u3030" "]+", flags=re.UNICODE) ans = emoji_pattern.sub(r'', text) return ans def is_ascii(self, s): ans = all(ord(c) < 128 or c == '…' or c == '’' or c == '³' or c == "¹⁹" for c in s) return ans def parse_percentage(self, string): """ change word to percent 100 percent -> 100% :param string: string to check if there is a percent within :return: array of converted strings """ return re.split('\s+', string)[0] + '%' def remove_panctuation(self, word): """ remove pancuations from word (like . 
or , or : ) :param word :return: word without panctuation """ # chars = set('.,:;!()[]{}?=+…$&') if re.match(r'[^@]+@[^@]+\.[^@]+', word): return word if "#" == word or "##" == word: return "" if word[-2:] == "'s" or word[-2:] == "’s" or word[-2:] == "`s": word = word.replace(word[-2:], "") smiles = [":)", ":(", ":-]", ":-)", ";)", ";-)", ":-(", ";(", ";-(", ":-P", ":P", ":p", ":-p"] for smile in smiles: if smile in word: word = word.replace(smile, "") if word in smiles: return '' if "\n" in word: word = word.replace("\n", " ") if '#' in word and word[0] != '#': word = word.replace("#", "") if '_' in word and '#' not in word: word = word.replace("_", "") if '@' in word and word[0] != '@': word = word.replace("@", "") word = word.replace("-", " ") word = word.replace("'", "") word = re.sub(r'[€£€4️⃣“”‘‼⑥²⁸¹❶❷❽②⑦&$~’.,!…|?,…:;^"{}*=+()⁰\/[\[\]]', '', word) return word def get_name_and_entities(self, entities_url, array_text_space): text = "" for word in array_text_space: if word == '' or word == '' or word[0] == '@' or word[0] == '#' or word == "RT": continue text += word + " " rx2 = re.compile(r'[A-Z][-a-zA-Z]+[1-9]*(?:\s+[A-Z][-a-zA-Z]+[1-9]*)*') matches = rx2.findall(text) tokinzed_entity_new = set() i = 0 for i in range(len(matches)): if len(str(matches[i]).split()) > 1: tokinzed_entity_new.add(str(matches[i])) i += 1 if "COVID 19" in text: tokinzed_entity_new.add("COVID 19") if "Covid 19" in text: tokinzed_entity_new.add("Covid 19") for word in tokinzed_entity_new: if word.lower() not in self.stop_words: all_places = [m.start() for m in re.finditer(word, text)] self.array_names_and_entities[word] = all_places return tokinzed_entity_new def parse_doc(self, doc_as_list,stemmer=False): """ This function takes a tweet document as list and break it into different fields :param doc_as_list: list re-preseting the tweet. :return: Document object with corresponding fields. """ tweet_id = doc_as_list[0] tweet_date = doc_as_list[1] full_text = doc_as_list[2] url = doc_as_list[3] indices = doc_as_list[4] retweet_text = doc_as_list[5] retweet_url = doc_as_list[6] retweet_indices = doc_as_list[7] quote_text = doc_as_list[8] quote_url = doc_as_list[9] quote_indices = doc_as_list[10] term_dict = {} entities_local_dict = {} array_url_parsed = [] url = str(url) rt = False if "RT" in full_text: rt = True if url != "{}" and "null" not in url: dict2 = eval(url) keys = dict2.keys() for key in keys: if dict2[key] != str("null") and "t.co" not in dict2[key]: url_parsed = self.parse_url(dict2[key]) check = url_parsed.split() for word in check: array_url_parsed.append(word) tokenized_text, names_and_entities = self.parse_sentence(full_text, stemmer=False) doc_length = len(tokenized_text) # after text operations. 
if doc_length == 0: return None for term in tokenized_text: if len(term) < 2: continue if stemmer: term = self.porter_stemmer.stem(term) if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 for term in array_url_parsed: if len(term) < 2: continue if stemmer: term = self.porter_stemmer.stem(term) if term.lower() in self.stop_words or term == 'http' or term == 'https' or term == 'www': continue if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 for term in names_and_entities.keys(): if len(term) < 2: continue if term in self.stop_words: continue if term not in term_dict.keys(): term_dict[term] = 1 else: term_dict[term] += 1 document = Document(tweet_id, tweet_date, full_text, url, retweet_text, retweet_url, quote_text, quote_url, term_dict, len(self.array_names_and_entities), rt, doc_length) return document
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
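# Hedged usage sketch (not part of the original source): the classic tartarus.org
# PorterStemmer port stems the slice word[i:j+1], which is why the calls in this
# file pass (word, 0, len(word) - 1), i.e. the whole string. Assuming the stemmer
# module is importable as above:
if __name__ == "__main__":
    for w in ["caresses", "ponies", "relational", "running"]:
        print(w, "->", stem(w))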
fout = open(boffile, 'w')
wordDict = defaultdict(int)
wordDict.clear()

# go through all tweets and count for the number of each term
while True:
    line = fin.readline()
    if not line:
        break
    text = Tparse.GetText(line.strip())
    tokens = re.split('\s+|_', text)
    for token in tokens:
        word = re.sub("[^a-z]", "", token.lower())
        word = stemmer.stem(word)
        if len(word) > 0:
            wordDict[word] += 1

# calculate frequency and output result
output_vector = []
total = sum([item for index, item in wordDict.items()])
output_vector = [(index, float(val)/float(total)) for index, val in wordDict.items()]
output_vector = sorted(output_vector, key=lambda item: item[0])
for index, value in output_vector:
    fout.write("%s %s\n" % (index, value))
fout.close()
class pre_worker(object): r_stopwords = None #停用词表 c_stopwords_lock = threading.Lock() def __init__(self,name,task_queue,result_queue): self.name = name self.r_stemmer = PorterStemmer() self.queue = task_queue self.queue2 = result_queue #载入停词表 pre_worker.load_stopwords() def work(self): process_print(u'%s开始工作' % (self.name,)) self.start_time = time.clock() count = 0 while 1: content_list = self.queue.get() #从任务队列里取出n篇文章的所有内容 if content_list == None: self.queue2.put(None) break count +=len(content_list) for content in content_list: result_list = self.__parse_doc(content) self.queue2.put(result_list) #将结果存入输出队列中 process_print(u' %s已退出,共处理了%d个任务' % (self.name,count)) @staticmethod def load_stopwords(): if pre_worker.r_stopwords == None: pre_worker.c_stopwords_lock.acquire() try: f = open('stopwords.txt') content = f.readline().split(',') f.close() pre_worker.r_stopwords = set(content) #print u'停词表载入成功',len(content) finally: pre_worker.c_stopwords_lock.release() def __parse_doc(self,content): items = re.findall(setting.c_reg_doc,content) result_list = [] for item in items: #获得doc的name result = re.search(setting.c_reg_doc_name,item) if result == None: continue name = result.group(1).strip() #获得doc的content result = re.search(setting.c_reg_doc_text,item) if result == None: continue text_content = result.group(1) term_list = self.__parse_text(text_content) if len(term_list) == 0: continue result_list.append((name,' '.join(term_list))) return result_list def __parse_text(self,text_content): #转换文档正文,分词 begin = 0 term_list = [] for index in range(0,len(text_content)): if text_content[index] not in setting.c_valid_chars: #词的长度要大于等于最小长度setting.c_min_length if index - begin < setting.c_min_length: begin = index + 1 continue content = text_content[begin:index].lower() #获取词根 content = self.r_stemmer.stem(content,0,len(content) - 1) begin = index + 1 #变为词根之后如果长度太小,依然舍弃 if len(content) < setting.c_min_length: continue #去停用词 if content in pre_worker.r_stopwords: continue term_list.append(content) return term_list def parse_topic(self,filename): content = get_all_text(filename) items = re.findall(setting.c_reg_topic,content) result_list = [] for item in items: result = re.search(setting.c_reg_topic_id,item) if result == None: continue id = result.group(1) index = item.index('<title>') text_content = re.sub(setting.c_reg_sub,' ',item[index:]) term_list = self.__parse_text(text_content) if len(term_list) == 0: continue result_list.append((id,' '.join(term_list))) return result_list
def GetDataset(): np.random.shuffle(emails) x_vals = [] y_vals = [] stemmer = PorterStemmer() word_mapping = GetWordDictionary(emails) i = 0 text_data = [] for i in range(0, len(emails)): #print "Evaluation Email %d" % (i) # note: time diff in mins email, next_email, time_diff, label = emails[i] # Create feature array features = [] #Feature 1: Number of to features.append(len(email['to'])) # Feature 2: Num words words = email['body'].split() lower_case_body = [ stemmer.stem(x.lower(), 0, len(x) - 1) for x in words ] features.append(len(words)) # Feature 3: Number of CC features.append(email['cc_count']) # Feature 4: is reply if email['is_re']: features.append(1) else: features.append(0) # Feature 5: Time of Day (hour) date = email['date']['local_date'] hour = date.hour features.append(hour) # Feature 6: Length of Subject Line subject_words = email['subject'].split() lower_case_subject = [ stemmer.stem(x.lower(), 0, len(x) - 1) for x in subject_words ] features.append(len(subject_words)) # Feature 7: Day of Week features.append(date.weekday()) # Feature 8: # Question marks in Body, bool in body features.append(email['body'].count('?')) # Feature 9: # Question marks in Subject, bool in subject features.append(email['subject'].count('?')) # NEW FEATURES # boolean: presence of ? in body / header features.append(1 if '?' in email['body'] else 0) features.append(1 if '?' in email['subject'] else 0) # Feature 12-13: "RESPONSE NEEDED" in subject or body keywords = ['response', 'please', 'can', 'urgent', 'important', 'need'] for keyword in keywords: stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1) features.append(1 if stemmed_keyword in lower_case_subject else 0) features.append(1 if stemmed_keyword in lower_case_body else 0) x_vals.append(features) y_vals.append(label) X = np.array(x_vals) Y = np.array(y_vals) return X, Y
for document, count in pairwise(documents):
    dictionary[document] = int(count)
count_term.update({entry[0]: int(entry[1])})
index.update({entry[0]: dictionary})

with open(lengths_file, "r") as lengths:
    documents_lengths = map(lambda line: line.strip(), lengths.readlines())
    documents_lengths = [int(length) for length in documents_lengths]

corpus_length = documents_lengths[len(documents_lengths) - 1]
del documents_lengths[len(documents_lengths) - 1]

for term in query:
    term = term.translate(None, string.punctuation)
    if term not in stopwords:
        inner_query.append(p.stem(term.lower(), 0, len(term) - 1))

documents_count = len(documents_lengths)
query_terms_count = len(inner_query)
heap = []
probabilities = [[1 for x in xrange(documents_count)] for x in xrange(query_terms_count)]
results = [1 for x in xrange(documents_count)]

for i, term in enumerate(inner_query):
    try:
        term_collection_frequency = count_term[term] / corpus_length
    except KeyError:  # the term doesn't appear in the collection
        continue
    for document in xrange(1, documents_count + 1):
        probabilities[i][document - 1] = lambda_ * (index[term][str(document)] /
                                                    documents_lengths[document - 1] +
                                                    term_collection_frequency)