def stem_words(self, terms):
    """Remove the suffixes in terms."""
    porter_stemmer = PorterStemmer()  # we use the Porter stemming algorithm
    terms = [porter_stemmer.stem(word) for word in terms]
    return terms
def main(argv):
    files = os.listdir(sys.argv[1])
    file = files[0]
    stemmed = []
    for file in files:
        text = ""
        infile = open(sys.argv[1] + file)
        a = infile.readline()
        while a:
            text += removeSGML(a)
            a = infile.readline()
        tok = tokenizeText(text)
        removed = removeStopwords(tok)
        from porter import PorterStemmer
        p = PorterStemmer()
        for element in removed:
            stemmed.append(p.stem(element, 0, len(element) - 1))
    print "Words " + str(len(stemmed))
    unique = list(set(stemmed))
    print "Vocabulary " + str(len(unique))
    wordfrequency = [(unique[x], stemmed.count(unique[x])) for x in range(0, len(unique))]
    sort = sorted(wordfrequency, key=getKey, reverse=True)
    for i in range(0, 49):
        print sort[i]
def stemWords(input):
    porter = PorterStemmer()
    words = input
    for index, word in enumerate(words):
        words[index] = porter.stem(word, 0, len(word) - 1)
    return words
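A quick sanity check of the helper above, assuming stemWords and the porter.py module used throughout these examples are in scope; the expected values are standard Porter stems:

tokens = ["running", "flies", "happily", "caresses"]
print(stemWords(tokens))
# expected: ['run', 'fli', 'happili', 'caress']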
def __init__(self, language='en', action='tfidf'):
    # Call LoadExternalLists, build the stop-word list,
    # and load the German lexicon.
    self.language = language
    self.action = action
    # characters stripped from the beginning and end of each token
    self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³²‚„ "
    loadRes = LoadExternalLists()
    if self.language == 'de':
        self.stopwords = loadRes.loadStopWordsDE()
        # stemmer object
        self.stemmer = GermanStemmer()
        # German dictionary
        print '\n', "Loading German Dictionary... OK", '\n'
        self.lexicon_de = loadRes.loadLexiconDe()
        self.normalizer = NormalizerDE()
    elif self.language == 'ru':
        self.stopwords = loadRes.loadStopWordsRU()
        self.stemmer = RussianStemmer()
        # pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
        self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
        self.normalizer = NormalizerRU()
    else:
        self.stopwords = loadRes.loadStopWordsEN()
        self.stemmer = PorterStemmer()
        self.normalizer = NormalizerEN()
    # list of irregular verbs
    self.irreg_verbs = loadRes.loadVerbForms()
    # list of irregular nouns
    self.irreg_nouns = loadRes.loadNounforms()
def search(self, word):
    # Create an instance of the Porter Stemmer.
    PS = PorterStemmer()

    # Get the information for the supplied word.
    res = self.manage_DB.get_info("index_word",
                                  where={"word": PS.stem(word, 0, len(word) - 1)})

    # The supplied word exists in the index_word table.
    if res:
        # Extract the id for the supplied word.
        wordid = res["id"]

        # Get all the entries in the index reference database that refer to
        # the supplied wordid.
        res = self.manage_DB.get_info("index_ref", where={"wordid": wordid})

        # For every entry in the list.
        for row in res:
            # Modify the current row to contain the stem word.
            row["word"] = self.manage_DB.get_info("index_word", rowid=row[1])["word"]

            # Modify the current row to contain the document name.
            row["doc"] = self.manage_DB.get_info("document", rowid=row[2])["name"]

        # Return the list of all the results.
        return res
    # The supplied word does not exist in the index_word table, so return
    # an empty list.
    else:
        return []
def __init__(self, stopwords, VERBTRANSFORMS, NOUNTRANSFORMS, lexicon_de, language):
    self.language = language
    self.stopwords = stopwords
    self.VERBTRANSFORMS = VERBTRANSFORMS
    self.NOUNTRANSFORMS = NOUNTRANSFORMS
    # characters stripped from the beginning and end of each token
    self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✩✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³²‚„ "
    # for splitting into tokens on whitespace and slashes
    self.splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')
    if self.language == 'ru':
        self.stemmer = RussianStemmer()
        # pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
        self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
        self.normalizer = NormalizerRU()
    elif self.language == 'de':
        self.stemmer = GermanStemmer()
        self.normalizer = NormalizerDE()
        self.lexicon_de = lexicon_de
    else:
        self.stemmer = PorterStemmer()
        self.normalizer = NormalizerEN()
def stemList(list1):
    p = PorterStemmer()
    lAllWordsStemmed = []
    for word in list1:
        word = p.stem(word, 0, len(word) - 1)
        lAllWordsStemmed.append(word)
    return lAllWordsStemmed
def stemWords(tokens):
    '''
    input: list of tokens
    output: list of stemmed tokens
    use the porter.py
    '''
    p = PorterStemmer()
    return map(lambda t: p.stem(t, 0, len(t) - 1), tokens)
def get_stemmed_words(word_list):
    stemmer = PorterStemmer()
    stemmed_words = set()
    for word in word_list:
        # add() the stemmed word; union() with a bare string would add its
        # individual characters rather than the word itself
        stemmed_words.add(stemmer.stem(word, 0, len(word) - 1))
    return stemmed_words
def process(input):
    s2 = tokenizeText(input)
    s3 = removeStopwords(s2)
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element) - 1))
    return pr
def read_vocabulary(path):
    f = open(path)
    vocab = f.read()
    words = vocab.strip().split(", ")
    vocab = []
    stemmer = PorterStemmer()
    for word in words:
        vocab.append(stemmer.stem(word, 0, len(word) - 1))
    return vocab
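A small way to exercise read_vocabulary is to write a throwaway comma-separated file first; a minimal sketch (the file name and contents are invented for illustration, and porter.py is assumed to be on the path):

# Write a tiny comma-separated vocabulary file, then read it back
# with read_vocabulary from above.
with open("vocab_demo.txt", "w") as f:
    f.write("running, jumped, cats, easily")

print(read_vocabulary("vocab_demo.txt"))
# expected Porter stems: ['run', 'jump', 'cat', 'easili']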
def process(input):
    s3 = tokenizeText(input)
    """s3 = removeStopwords(s2)"""
    pr = []
    from porter import PorterStemmer
    p = PorterStemmer()
    for element in s3:
        pr.append(p.stem(element, 0, len(element) - 1))
    return pr
def create_posting_list(self, stopword_toggle, stemming_toggle): """ function to go through all the documents abstracts cleaning and adding each term to a posting_list object and the term dictionary. removes all the special characters for each term. toggles stopwords and stemming accordingly Note: all terms are converted to lowercase :param stopword_toggle: boolean, toggles the stopword usage :param stemming_toggle: boolean, toggles the stemming of words """ self.terms = {} self.termsDictionary = {} documents = self.documents stopwords = [] if stopword_toggle: stopwords = fetch_stopwords() for doc_id, document in documents.items(): if 'abstract' in document: for index, word in enumerate(document['abstract'].split(' ')): word = word.rstrip().lower() for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']: if a in word: if word.index(a) == 0 or word.index(a) == len(word) - 1: word = word.replace(a, '') if stemming_toggle: p = PorterStemmer() word = p.stem(word, 0, len(word) - 1) if word in stopwords: continue if len(word) > 0: if word not in self.terms.keys(): self.terms[word] = {} if doc_id not in self.terms[word].keys(): self.terms[word][doc_id] = { 'frequency': 0, 'position': [], } self.terms[word][doc_id]['frequency'] += 1 self.terms[word][doc_id]['position'].append(index) for term, value in self.terms.items(): self.termsDictionary[term] = len(value) f = open('dictionary.json', 'w') f.write(json.dumps(self.termsDictionary, indent=4, sort_keys=True)) f.close() f = open('posting-list.json', 'w') f.write(json.dumps(self.terms, indent=4, sort_keys=True)) f.close()
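To make the shape of dictionary.json and posting-list.json concrete, here is a standalone sketch of the same term -> doc_id -> {frequency, position} structure for two invented documents (stemming and stop-word removal omitted for brevity):

import json

documents = {"d1": "porter stemmer removes suffixes",
             "d2": "the stemmer maps related words to one stem"}

terms = {}
for doc_id, text in documents.items():
    for index, word in enumerate(text.split(' ')):
        word = word.rstrip().lower()
        entry = terms.setdefault(word, {}).setdefault(doc_id, {'frequency': 0, 'position': []})
        entry['frequency'] += 1
        entry['position'].append(index)

# document frequency per term, as written to dictionary.json
dictionary = {term: len(postings) for term, postings in terms.items()}
print(json.dumps(dictionary, indent=2, sort_keys=True))
print(json.dumps(terms['stemmer'], indent=2, sort_keys=True))
# {"d1": {"frequency": 1, "position": [1]}, "d2": {"frequency": 1, "position": [1]}}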
def index_document(self, docid, path_physical): self.manage_DB.delete_references(docid) # Get the information for the supplied document. document = self.manage_DB.get_info('document', rowid=docid) # Open the document for reading. fhandle = open('%s%s' % (path_physical, docid), 'r') # Create an instance of the Porter Stemmer. PS = PorterStemmer() # Get the 1st line of the supplied document and force the contents to # lowercase. content = fhandle.readline().lower() # The text widget starts indexing its lines at 1, but columns start # indexing at 0. line_count = 1 # While the supplied document has content to be read. while content != '': # Find all words from the current line of the supplied document # and put them in a list. words = re.findall('\w+', content) # For each word in the list of words from the current line. for word in words: # Only words whose length is greater than 3 will be indexed. if len(word) > 3: # Check for the word in the list of stop words. res = self.manage_DB.get_info('stop_words', where={ 'word': word}) # If the word does not exist in the list of stop words: if not res: # The column of the current word is its index in the # current line. col_count = content.find(word) + 1 # Using the PorterStemmer, find the root of the current # word. Add the root word, with the current line and # column number to the index. self.add_index_word( PS.stem(word, 0, len(word) - 1), docid, line_count, col_count, word) # Get the next line of the supplied document and force the # contents to lowercase. content = fhandle.readline().lower() # Increment the line count. line_count += 1 # Close the supplied document file. fhandle.close() return
def stem(tokens):
    """
    receive tokens
    return stemmedTokens
    """
    stemmedTokens = []
    stemmer = PorterStemmer()
    for token in tokens:
        stemmedTokens.append(stemmer.stem(token, 0, len(token) - 1))
    return stemmedTokens
def tokenize(inputStr):
    tokenPattern = re.compile(r'[^a-zA-Z0-9.,_]')
    # tokenPattern = re.compile(r'[\s:?;()\[\]&!*@#$%+<>/\\\'\"]|\.(\.)+|(-)+')
    primordialTokens = re.split(tokenPattern, inputStr)
    # primordialTokens = inputStr.replace(">", " ").replace("...", " ").replace("-"," ").replace("'"," ").replace("/"," ").split(' ')
    stripPuncTokens = [x.strip(',.').replace(",", "").lower() for x in primordialTokens if x != None]
    stripPuncTokens = [x for x in stripPuncTokens if x != '' and x not in stop_words]
    # stemming
    p = PorterStemmer()
    stemmedTokens = [p.stem(x, 0, len(x) - 1) for x in stripPuncTokens]
    return stemmedTokens
def porter(text):
    p = PorterStemmer()
    output = ''
    word = ''
    lines = text.split('\n')
    # walk each line character by character, stemming every alphabetic run
    for line in lines:
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
    return output.split()
def add(self, text, fname, stem=False):
    """Add a string of text to the corpus by first splitting it into
    features defined by WORD_PATT, and then removing stop words.
    Takes a string as its argument."""
    for match in re.finditer(self.WORD_PATT, text):
        if match:
            word = match.group(0).lower()
            if word in self.STOPWORDS:
                # record and skip stop words instead of indexing them
                self.removed.append(word)
                continue
            if stem:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
            self.words.add_word(word, fname)
def revise_documents(docs, vocab):
    stemmer = PorterStemmer()
    senses = {}  # {reference: sense}
    for ref, text in docs.items():
        words = re.findall(r"[\w']+", text)
        word_list = []
        for w in words:
            if w == "tag":
                continue
            if w.isdigit() and int(w) > 100000:
                senses[ref] = w
                continue
            if w in vocab:
                word_list.append(stemmer.stem(w.lower(), 0, len(w) - 1))
        docs[ref] = word_list
    return docs, senses
class WindowPorterStemStringFeature(object): def __init__(self): self._stemmer = PorterStemmer() def get_id(self): return 'WINDOW-STEM-STRING' def _stem(self, token): return self._stemmer.stem(token, 0, len(token) - 1) def featurise(self, document, sentence, annotation): NORMALISE = True before_ann = sentence.text[:annotation.start].split() before_ann.reverse() after_ann = sentence.text[annotation.end:].split() to_yield = [] for i, tok in izip(xrange(1, 4), before_ann): to_yield.append((u'-BEFORE-{}-{}'.format(i, self._stem(tok)), 1)) for i, tok in izip(xrange(1, 4), after_ann): to_yield.append((u'-AFTER-{}-{}'.format(i, self._stem(tok)), 1)) for f_tup in to_yield: if NORMALISE: yield (f_tup[0], f_tup[1] / float(len(to_yield))) else: yield f_tup
class Pepper(object): """ The Pepper Pots of UI (Public Relations) for Tony Stark. Handles the user inputting queries, parsing the queries, and returning results from the indexed corpus by Ironman """ def __init__(self, documents, NDC, stop_words): super(Pepper, self).__init__() self.documents = documents self.NDC = NDC self.p = PorterStemmer() self.stop_words = stop_words def handleQuery(self, user_input): """ Handles the process of formatting a user_inputted query """ scores = [] stem_query = self.p.stemText(user_input, self.stop_words).encode('utf_8', 'ignore') query = Document(stem_query, full_text=user_input) self.NDC.normalize(query) for document in self.documents: scores.append((self.NDC.score(query, document), document)) scores = sorted(scores, reverse=True) return scores def score(query, document): return 1
def __init__(self, db_file, indexer):
    self.db_file = db_file
    self.indexer = indexer
    self.db = sqlite3.connect(db_file)
    self.p = PorterStemmer()
    self.sw = stopwords.StopWords(self.stopword_file)
    self.re_tag = RE_TAG
def score_query(self, query, word_matrix, normalized_matrix, stop_words_list, title_vocabulary_dict): porter_stemmer = PorterStemmer() square_sum = 0 words = {} for word in query: word_without_punctuation = word.strip(string.punctuation).replace( " ", "").lower() if word_without_punctuation not in stop_words_list: stemmed_word = porter_stemmer.stem( word_without_punctuation, 0, len(word_without_punctuation) - 1) if stemmed_word not in words: words[stemmed_word] = {} words[stemmed_word]['repetitions'] = 0 words[stemmed_word]['repetitions'] += 1 for word, elements in words.items(): square_sum += math.pow(elements['repetitions'], 2) for word, elements in words.items(): if word in word_matrix: words[word]['normalized'] = words[word][ 'repetitions'] / math.sqrt(square_sum) words[word]['weight'] = words[word][ 'normalized'] * word_matrix[word]['idf'] else: words[word]['normalized'] = 0 words[word]['weight'] = 0 aggregate_scores = {} title_addition_performed = [] for word, elements in words.items(): if word in normalized_matrix: for doc_id, doc_weight in normalized_matrix[word].items(): if doc_id not in aggregate_scores: aggregate_scores[doc_id] = 0 aggregate_scores[doc_id] += doc_weight * elements['weight'] if word in title_vocabulary_dict: if doc_id in title_vocabulary_dict[ word] and doc_id not in title_addition_performed: aggregate_scores[doc_id] += 0.5 title_addition_performed.append(doc_id) return aggregate_scores
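The query side above normalizes raw term counts by the Euclidean norm of the count vector before multiplying by idf; a self-contained sketch of just that step, with invented counts and idf values:

import math

# hypothetical stemmed-query term counts and idf values, for illustration only
counts = {"stem": 2, "porter": 1}
idf = {"stem": 1.2, "porter": 2.0}

norm = math.sqrt(sum(c ** 2 for c in counts.values()))   # sqrt(2^2 + 1^2) ~= 2.236
weights = {term: (c / norm) * idf[term] for term, c in counts.items()}
print(weights)   # {'stem': ~1.073, 'porter': ~0.894}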
def load(self, ndxfile):
    with open(ndxfile, "r") as f:
        self.ndx = json.loads(f.read())
    self.p = PorterStemmer()
    self.sw = stopwords.StopWords(self.stopword_file)
    for w, val in self.ndx.items():
        for d in val:
            self.dim_map[d] = len(self.dims) - 1
            self.dims.add(d)
def search(self, word): # Create an instance of the Porter Stemmer. PS = PorterStemmer() # Get the information for the supplied word. res = self.manage_DB.get_index_word_info( PS.stem(word, 0, len(word) - 1)) # The supplied word exist in the index_word table. if res: # Extract the id for the supplied word. wordid = res['id'] # Return the found entries as a list. res = [] # Query the index_ref table for all the entries whose wordid # match the supplied word's id. self.c.execute("""select * from index_ref where wordid=?""", (wordid,)) # Retrieve all the results of the query as a list. entries = self.c.fetchall() # For ever entry in the list. for row in entries: # Create a dictionary with the results and add the dictionary # to the list. res.append({ 'id': row[0], 'word': self.manage_DB.get_index_word_info(row[1])['word'], 'docid': row[2], 'doc': self.manage_DB.get_document_info(row[2])['name'], 'line': row[3], 'column': row[4], 'branch_word': row[5]}) # Return the list of all the results. return res # The supplied word does not exist in the index_word table, so return # and empty list. else: return []
def tokenize_on_porter(text): word_list = [] p = PorterStemmer() outfile = open('out3', 'w') for line in text.splitlines(): output = '' word = '' if line != '': for c in line: if c.isalpha(): word += c.lower() else: if word: word_stem = p.stem(word, 0, len(word)-1) output += word_stem word_list.append(word_stem) word = '' output += c.lower() print(output, end='\n', file=outfile) outfile.close() return word_list
def make_cloud(self): stemdict, tempdict, finaldict = {}, {}, {} stopwords = open('stopwords.txt', 'r').read().split('\n') # Extract just the words inside quotes quotes = ' '.join(self.extract_quotes()) wordlist = re.split('\s+', quotes.lower()) p = PorterStemmer() punctuation = re.compile(r'[.?!,":;-]') # Stem all of the words in the word list using the Porter Stemmer for w in wordlist: w = punctuation.sub('', w) s = p.stem(w, 0,len(w)-1) try: tempdict[w] += 1 except: tempdict[w] = 1 stemdict.setdefault(s,{}).update({w:tempdict[w]}) cumfreq = 0 # Calculate the cumulative frequencies of the stemmed words for k, v in stemdict.items(): for l, m in v.items(): cumfreq = cumfreq + m items = v.items() items.sort(lambda x, y: cmp(y[1], x[1])) finaldict[items[0][0]] = cumfreq cumfreq = 0 # Remove stopwords like "the", "it", "a", etc. for word in stopwords: try: del finaldict[word] except: pass results = self.process_cloud(8, finaldict.items()[:50]) return results
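make_cloud merges the counts of different surface forms that share a Porter stem before building the cloud; a minimal sketch of that grouping step on an invented word list, assuming the same porter.py module as the other examples:

from porter import PorterStemmer

p = PorterStemmer()
words = ["run", "runs", "running", "runner", "pony", "ponies"]

by_stem = {}
for w in words:
    s = p.stem(w, 0, len(w) - 1)
    by_stem.setdefault(s, {})
    by_stem[s][w] = by_stem[s].get(w, 0) + 1

# cumulative frequency per stem, as in the word-cloud code above
cumfreq = {s: sum(forms.values()) for s, forms in by_stem.items()}
print(cumfreq)   # e.g. {'run': 3, 'runner': 1, 'poni': 2}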
def search(self, word, docid=None): # Create an instance of the Porter Stemmer. PS = PorterStemmer() # Get the information for the supplied word. res = self.manage_DB.get_info('index_word', where={ 'word': PS.stem(word, 0, len(word) - 1)}) # The supplied word exist in the index_word table. if res: # Extract the id for the supplied word. wordid = res[0]['id'] if docid: # Get all the entries in the index reference database that refer to # the supplied wordid. res = self.manage_DB.get_info('index_ref', where={ 'wordid': wordid, 'docid': docid}) else: # Get all the entries in the index reference database that refer to # the supplied wordid. res = self.manage_DB.get_info('index_ref', where={ 'wordid': wordid}) # For ever entry in the list. for row in res: # Modify the current row to contain the stem word. row['word'] = self.manage_DB.get_info( 'index_word', rowid=row['wordid'])['word'] # Modify the current row to contain the document name. row['doc'] = self.manage_DB.get_info( 'document', rowid=row['docid'])['name'] # Return the list of all the results. return res # The supplied word does not exist in the index_word table, so return # and empty list. else: return []
class SentencePorterStemStringFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-SENTENCE-STEM'

    def featurise(self, document, sentence, annotation):
        for token in sentence.text.split():
            yield (self._stem(token), 1)
class StringPorterStemFeature(object):
    def __init__(self):
        self._stemmer = PorterStemmer()

    def _stem(self, token):
        return self._stemmer.stem(token, 0, len(token) - 1)

    def get_id(self):
        return 'STRING-STEM'

    def featurise(self, document, sentence, annotation):
        yield (self._stem(sentence.annotation_text(annotation)), 1)
def __init__(self):
    '''
    if phrase_dict_json != None: extract the phrase features
    if subtype_flag = True, extract the features by sub parse_type
    if bioe_flag = True, use the BIOE tags
    '''
    self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']

    if 'pos' in self.features:
        self.pos_tagger = SennaTagger(global_params.sennadir)

    if 'chunk' in self.features:
        self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

    self.sentences = []
    self.porter = PorterStemmer()
    self.token_dict = None
    self.bins = 50
def getDocStuff(dDocProps): lAllLists = [] if (constants.T in dDocProps): lAllLists.append(dDocProps[constants.T]) putinDPLace("1",dDocProps[constants.T]) if (constants.W in dDocProps): lAllLists.append(dDocProps[constants.W]) putinDPLace("2",dDocProps[constants.W]) if (constants.A in dDocProps): lAllLists.append(dDocProps[constants.A]) putinDPLace("3",dDocProps[constants.A]) lAllLines = [] for lList in lAllLists: lAllLines.extend(lList) lAllWords = [] for sLine in lAllLines: sLine = re.sub('[^a-zA-Z0-9]', ' ', sLine) lWords = sLine.lower().split() lAllWords.extend(lWords) lw = copy.deepcopy(lAllWords) lAllWords = helperFunctions.remStopWords(lAllWords) p = PorterStemmer() lAllWordsStemmed = [] for word in lAllWords: word = p.stem(word,0,len(word)-1) lAllWordsStemmed.append(word) lUniqueWords = list(set(lAllWordsStemmed)) lenAllWords = len(lAllWordsStemmed) constants.allDocsLen = constants.allDocsLen+lenAllWords sRet = helperFunctions.makeFixedLengthStr(len(lAllWordsStemmed),constants.docWordCntLen)+constants.space+helperFunctions.makeFixedLengthStr(len(lUniqueWords),constants.docWordCntLen)+constants.newLine return [sRet,lAllWordsStemmed," ".join(lw)]
class Parser(object): """ The parsing workhorse of the entire project. """ def __init__(self, stop_words, **kwargs): """ The constructor for the Parser object. @stop_words could be one a list of stop words, or None """ super(Parser, self).__init__() # Checks if stop_words is a list if stop_words is not None: self.stop_words = [] for word in stop_words: self.stop_words.append(word.lower()) else: self.stop_words = None self.hashes = [] self.documents = [] self.num_duplicates = 0 self.p = PorterStemmer() def retrieveText(self, page_soup, url): """ Retrieves all the non-markup text from a webpage that has already been crawled. @page_soup: The soupified version of a webpage """ # Retrieve all the text of the page minus the html tags page_text = page_soup.get_text() # Stems and returns all the non-stopword text stem_text = self.p.stemText(page_text, self.stop_words).encode('utf_8', 'ignore') # Create a hash to make sure there are no 100% duplicates in the pages # The hex digest will also be used as the document ID, since they will # be unique unless they are a duplicate h = hashlib.md5() h.update(stem_text) page_hash = h.hexdigest() # If the page is not a duplicate, add the hash to a list of found # hashes, and create a Document object to keep track of the information # for each Document if page_hash not in self.hashes: self.hashes.append(page_hash) self.documents.append(Document(stem_text, page_text, url, page_hash)) else: self.num_duplicates += 1
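retrieveText deduplicates pages by hashing the stemmed text with MD5 and treating the hex digest as the document id; a minimal sketch of that idea on plain strings (the sample texts are invented):

import hashlib

seen_hashes = []
documents = []
num_duplicates = 0

for text in ["porter stemmer example", "another page", "porter stemmer example"]:
    digest = hashlib.md5(text.encode("utf_8")).hexdigest()
    if digest not in seen_hashes:
        seen_hashes.append(digest)
        documents.append((digest, text))
    else:
        num_duplicates += 1

print(len(documents), num_duplicates)   # 2 unique pages, 1 duplicate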
def getDocStuff(dDocProps): global T,W,B,A,N,I lAllLists = [] if (T in dDocProps): lAllLists.append(dDocProps[T]) if (W in dDocProps): lAllLists.append(dDocProps[W]) #if (B in dDocProps): # lAllLists.append(dDocProps[B]) if (A in dDocProps): lAllLists.append(dDocProps[A]) #if (N in dDocProps): # lAllLists.append(dDocProps[N]) lAllLines = [] for lList in lAllLists: lAllLines.extend(lList) lAllWords = [] for sLine in lAllLines: lWords = sLine.split() lAllWords.extend(lWords) lAllWords = helperFunctions.remStopWords(lAllWords) p = PorterStemmer() lAllWordsStemmed = [] for word in lAllWords: word = p.stem(word,0,len(word)-1) lAllWordsStemmed.append(word) #print("All words :", lAllWordsStemmed,"\n") lUniqueWords = list(set(lAllWordsStemmed)) lenAllWords = len(lAllWordsStemmed) lenAllWords sRet = makeFixedLengthStr(len(lAllWordsStemmed),6)+" "+makeFixedLengthStr(len(lUniqueWords),6) #+":"+dDocProps[B][0] return [sRet,lAllWordsStemmed]
def process_query(self, query): all_doc_count = len(self.invert.documents.keys()) query_array = [x.lower() for x in query.split(' ')] query_weights = {} stopwords = [] if self.stopword_toggle: stopwords = fetch_stopwords() while query_array: word = query_array.pop(0) frequency = 1 for a in [',', '.', '{', '}', '(', ')', ';', ':', '"', '\'']: if a in word: if word.index(a) == 0 or word.index(a) == len(word) - 1: word = word.replace(a, '') while word in query_array: query_array.pop(query_array.index(word)) frequency += 1 if self.stemming_toggle: p = PorterStemmer() word = p.stem(word, 0, len(word) - 1) if word in stopwords: continue term_weight = 0 if word in self.invert.termsDictionary.keys(): document_frequency = self.invert.termsDictionary[word] idf = math.log(all_doc_count / document_frequency) term_frequency = 1 + math.log(frequency) term_weight = idf * term_frequency query_weights[word] = term_weight return query_weights
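process_query weights each query term with idf = log(N / df) and tf = 1 + log(f); a tiny worked sketch with invented numbers:

import math

all_doc_count = 100          # N: documents in the collection (invented)
document_frequency = 10      # df: documents containing the term (invented)
frequency = 3                # f: occurrences of the term in the query

idf = math.log(all_doc_count / document_frequency)   # log(10) ~= 2.303
term_frequency = 1 + math.log(frequency)             # 1 + log(3) ~= 2.099
term_weight = idf * term_frequency                   # ~= 4.83
print(term_weight)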
def __init__(self, stop_words, **kwargs): """ The constructor for the Parser object. @stop_words could be one a list of stop words, or None """ super(Parser, self).__init__() # Checks if stop_words is a list if stop_words is not None: self.stop_words = [] for word in stop_words: self.stop_words.append(word.lower()) else: self.stop_words = None self.hashes = [] self.documents = [] self.num_duplicates = 0 self.p = PorterStemmer()
def initBooleanQuery():
    #start_time = time.time()
    term2tidFile = open("term2tid.json", "r")
    indexFile = open("invertedIndex.json", "r")
    global term2id
    term2id = json.load(term2tidFile)
    global invertedIndex
    invertedIndex = json.load(indexFile)
    #print("--- %s seconds ---" % (time.time() - start_time))
    term2tidFile.close()
    indexFile.close()
    global wholeList
    wholeList = range(utils.docCount)
    global wholeLen
    wholeLen = utils.docCount
    global p
    p = PorterStemmer()
def build(self, docpath, outfile):
    p = PorterStemmer()
    sw = stopwords.StopWords(self.stopword_file)
    ndx = defaultdict(list)
    for filename in os.listdir(docpath):
        if not filename.endswith(".txt"):
            continue
        doc_id = hash(filename.replace(".txt", ""))
        with open(os.path.join(docpath, filename)) as f:
            f_content = kwutils.normalize(f.read().lower())
        words = kwutils.tokenize(f_content)
        w_stemmed = kwutils.stem(words, p)
        w_stopped = kwutils.filter_stopwords(w_stemmed, sw)
        for word in w_stopped:
            if len(word) > 0:
                if not doc_id in ndx[word]:
                    ndx[word].append(doc_id)
    with open(outfile, 'w') as f:
        f.write(json.dumps(ndx))
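Querying the JSON index that build() writes is a stem-then-lookup; a minimal sketch (the 'index.json' path is an assumption standing in for whatever outfile was used):

import json
from porter import PorterStemmer

p = PorterStemmer()

with open("index.json") as f:        # whatever path was passed as outfile
    ndx = json.load(f)

query = "stemming"
stemmed = p.stem(query.lower(), 0, len(query) - 1)
matching_doc_ids = ndx.get(stemmed, [])
print(matching_doc_ids)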
class CRF_Extractor: ''' extract features for the CRF model each line is a feature vector for a token ''' def __init__(self): ''' if phrase_dict_json != None: extract the phrase features if subtype_flag = True, extract the features by sub parse_type if bioe_flag = True, use the BIOE tags ''' self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'] if 'pos' in self.features: self.pos_tagger = SennaTagger(global_params.sennadir) if 'chunk' in self.features: self.chunk_tagger = SennaChunkTagger(global_params.sennadir) self.sentences = [] self.porter = PorterStemmer() self.token_dict = None self.bins = 50 def add_sentence(self, sentence): self.sentences.append(sentence) def get_token_tf(self): self.token_dict = defaultdict(float) for tokens, _, _ in self.sentences: for token in self.porter.stem_tokens(tokens): self.token_dict[token] += 1.0 self.rank_dict = defaultdict(int) rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True) self.rank_dict = defaultdict(int) for i, token in enumerate(rank_tokens): self.rank_dict[token] = int(i*10/len(rank_tokens)) for t, v in self.token_dict.items(): #normalized by the number of sentences x = v/len(self.sentences) if x > 1.0: x = 1.0 self.token_dict[t] = x def get_feature_names(self): return '_'.join(self.features) def get_i_j(self, body, i, j): ''' return the value of the crf template feature u[i, j] intput: body: [][], two-dimentionary array, representing the crf features for a sentence i: int, the index of i j: int, the index of j ''' n = len(body) if i < 0: v = '_x%d'%(i) elif i >= n: v = '_x+%d'%(i-n+1) else: v = body[i][j] return v def extract_U_i_j(self, data_body, feature_body, i, j, tag): ''' extract the U[i, j] feature, and add it to the end of each row intput: data_body: [][], two-dimentionary array, representing the crf data for a sentence feature_body: [][], two-dimentionary array, the resulting feature data for a sentence i: int, the index of i j: int, the index of j tag: the prefix of the feature name ''' for k, row in enumerate(feature_body): row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j))) def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag): ''' extract the U[i, j]/U[m, n] feature, and add it to the end of each row ''' for k, row in enumerate(feature_body): row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n))) def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag): ''' extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row ''' for k, row in enumerate(feature_body): row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y))) def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag): ''' extract the U[i, j] feature, and add it to the end of each row ''' for k, row in zip(index, feature_body): row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j))) def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag): ''' extract the U[i, j]/U[m, n] feature, and add it to the end of each row ''' for k, row in zip(index, feature_body): row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n))) def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag): ''' extract the U[i, j]/U[m, n] feature, and add it to the end of each row ''' for k, row in zip(index, feature_body): row.append('%s:%s/%s/%s'%(tag, 
self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y))) def extract_bigram(self, body): ''' extract the bigram feature for the crf template ''' for row in body: row.append('b') def extract_crf_features(self, tokens, tags, prompt, colors=None): ''' Extract the character features, each token a line return: [][], two dimentionary array, representing the feature data of the sentence ''' body = [] words = tokens N = len(tokens) #first row: the word token for word in words: row = [] row.append(word) body.append(row) if 'pos' in self.features: pos_tags = self.pos_tagger.tag(tokens) for i, (_, p_tag) in enumerate(pos_tags): body[i].append(p_tag) if 'chunk' in self.features: chunk_tags = self.chunk_tagger.tag(tokens) for i, (_, c_tag) in enumerate(chunk_tags): body[i].append(c_tag) if 'promptword' in self.features: for i, token in enumerate(tokens): if token in prompt_words[prompt]: body[i].append('Y') else: body[i].append('N') if 'stopword' in self.features: for i, token in enumerate(tokens): if token in stopwords: body[i].append('Y') else: body[i].append('N') if 'tf' in self.features: if self.token_dict == None: self.get_token_tf() for i, token in enumerate(self.porter.stem_tokens(tokens)): assert(token in self.token_dict) x = int(self.token_dict[token]*self.bins) body[i].append(str(x)) if 'rank' in self.features: if self.rank_dict == None: self.get_token_tf() for i, token in enumerate(self.porter.stem_tokens(tokens)): assert(token in self.rank_dict) x = self.rank_dict[token] body[i].append(str(x)) if 'color' in self.features and colors != None: for color in colors: for i, tag in enumerate(tags): body[i].append(str(color[i])) #last row: tags = [tag for tag in tags] for i, tag in enumerate(tags): body[i].append(tag) return body
poslineedited = [] neglinesedited = [] #there are total 6397 positives and negatives. poslinesTrain= poslines[:3201] neglinesTrain= neglines[:3196] priorknowledgepo = [] priorknowledgeneg = [] priorknowledgeneg= 3196/ 6397 priorknowledgepo = 3201/ 6397 stemmer = PorterStemmer() model = open('F:/ifa/NaiveBayes/model_file.csv', 'w',encoding="utf8") trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain] poswords={} #this dictionary stores counts for every word in positives negwords={} #and negatives for line,label in trainset: words= getwords(line) for word in words: word.lower() #increment the counts for this word based on the label #the .get(x, 0) method returns the current count for word #x, of 0 if the word is not yet in the dictionary
class BuildTermSpace(object):
    """
    Build a JSON object (a dictionary) that stores the stems of content
    words and their frequencies across the specified corpora.
    """

    def __init__(self, language='en', action='tfidf'):
        # Call LoadExternalLists, build the stop-word list,
        # and load the German lexicon.
        self.language = language
        self.action = action
        # characters stripped from the beginning and end of each token
        self.punctuation = "∙!‼¡\"#£€$¥%&'()*+±×÷·,-./:;<=>?¿@[\]^ˆ¨_`—–{|}~≈≠→↓¬’“”«»≫‘…¦›🌼′″¹§¼⅜½¾⅘©✒•►●★❤➡➜➚➘➔✔➓➒➑➐➏➎➍➌➋➊❸❷■†✝✌️³²‚„ "
        loadRes = LoadExternalLists()
        if self.language == 'de':
            self.stopwords = loadRes.loadStopWordsDE()
            # stemmer object
            self.stemmer = GermanStemmer()
            # German dictionary
            print '\n', "Loading German Dictionary... OK", '\n'
            self.lexicon_de = loadRes.loadLexiconDe()
            self.normalizer = NormalizerDE()
        elif self.language == 'ru':
            self.stopwords = loadRes.loadStopWordsRU()
            self.stemmer = RussianStemmer()
            # pymorphy2.MorphAnalyzer() object; we will use its normal_form attribute
            self.lemmatizer_ru = pymorphy2.MorphAnalyzer()
            self.normalizer = NormalizerRU()
        else:
            self.stopwords = loadRes.loadStopWordsEN()
            self.stemmer = PorterStemmer()
            self.normalizer = NormalizerEN()
        # list of irregular verbs
        self.irreg_verbs = loadRes.loadVerbForms()
        # list of irregular nouns
        self.irreg_nouns = loadRes.loadNounforms()

    def processString(self, line):
        """
        Process each word of a line in turn. Takes a string, builds the
        tokens list from the words produced by re.split, stripping
        punctuation from both ends of each word, lowercasing it and removing
        contraction endings with del_contractions along the way. It then
        builds the list of stemmed terms, rslt_list, dropping stop words and
        digit sequences. Returns rslt_list, which contains only the stems of
        content words.
        """
        # for splitting into tokens on whitespace and slashes
        splitchars = re.compile(r'[\s\\\/\(\)\[\]\<\>\;\:\,\‚\—\?\!\|\"«»…#]|\.\.\.+|[ �⌂ ∞½¾►=]|\-\-|\.[\'\"’“”«»‘′″„-]')
        # [\.\:][\'\"’“”«»‘′″]
        # for ignoring tokens that contain digits
        esc_num = re.compile(r'[0-9]+')
        # for ignoring URLs
        #url_esc = re.compile(r'([a-z]{3,6}:\/\/)?([a-zA-Z0-9\-@?]+[\.|\:])+[a-z]{2,13}[\.\?\=\&\%\,\#\+\(\)\/\w\-]*')
        if self.language == 'de':
            tokens = (self.normalizer.normalizeUmlaut(self.normalizer.deleteContrs(token.strip(self.punctuation).lower())) for token in splitchars.split(line))
            # add "and not esc_num.search(term)" after the stop-word condition if tokens containing digits should be dropped
            rslt_list = (self.stemmer.stem(self.normalizer.lemmatize(term, self.lexicon_de)) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        elif self.language == 'ru':
            tokens = (self.normalizer.normalizeE(token.strip(self.punctuation).lower()) for token in splitchars.split(line))
            # add "and not esc_num.search(term)" after the stop-word condition if tokens containing digits should be dropped
            rslt_list = (self.stemmer.stem(self.lemmatizer_ru.parse(term)[0].normal_form) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        else:
            # token generator, in one pass: 1. split the line into tokens with the splitchars regexp,
            # 2. strip punctuation around each token and lowercase it,
            # 3. transform irregular verb/noun forms into their base form,
            # 4. remove contraction endings with '
            tokens = (self.normalizer.token_transform(self.normalizer.del_contractions(token.strip(self.punctuation).lower()), self.irreg_verbs, self.irreg_nouns) for token in splitchars.split(line))
            # term generator: if the term is not a stop word and contains no digits, stem it
            # (add "and not esc_num.search(term)" after the stop-word condition if tokens containing digits should be dropped)
            rslt_list = (self.stemmer.stem(term, 0, len(term)-1) for term in tokens if term not in self.stopwords and not esc_num.search(term) and len(term)>0)
        if not rslt_list:
            return []
        else:
            return rslt_list

    def processFile(self, filename):
        """
        Reads the file as UTF-16, calls processString for every line and adds
        each word of the resulting list to the set terms_set, discarding
        duplicates. Returns the unique lemmas.
        """
        terms_set = set()
        terms_list = []
        if self.action == 'tfidf':
            try:
                with codecs.open(filename, 'r', 'utf-16') as infile:
                    for line in infile:
                        if len(line) > 1:
                            for term in self.processString(line):
                                terms_set.add(term)
            except (UnicodeDecodeError, UnicodeError, IOError):
                pass
            return terms_set
        if self.action == 'raw':
            try:
                with codecs.open(filename, 'r', 'utf-16') as infile:
                    for line in infile:
                        if len(line) > 1:
                            for term in self.processString(line):
                                terms_list.append(term)
            except (UnicodeDecodeError, UnicodeError, IOError):
                pass
            return terms_list

    def crawl(self, dirname):
        """
        Walks the folders and subfolders of the directory given as argument.
        If a file is a text file, runs processFile and collects its result in
        the set terms_set. The shared terms_dict counts the frequency of each
        lemma and is saved as JSON. terms_dict essentially reflects the second
        part of the tf-idf formula, i.e. in how many documents a term occurs.
        """
        docs_num = 0
        terms_dict = defaultdict(int)
        for root, dirs, files in os.walk(dirname):
            print root, "processing..."
            for filename in files:
                if filename.endswith('.txt') or filename.endswith('.TXT'):
                    print filename
                    terms_set = self.processFile(join(root, filename))
                    for term in terms_set:
                        terms_dict[term] += 1
                    docs_num += 1
        if self.action == 'raw':
            with codecs.open(r'.\termSpace\\'+self.language.upper()+'frequency_list_stem.txt', 'w', 'utf-16') as outfile:
                for key, value in sorted(terms_dict.iteritems(), key=lambda x:x[1], reverse=True):
                    outfile.write(key+'\t'+str(value))
                    outfile.write('\n')
        if self.action == 'tfidf':
            with open(r".\termSpace\\" + self.language.upper() + "CorpusDict_" + str(docs_num) + ".json", 'w') as outfile:
                json.dump(terms_dict, outfile)
# MIT Licensed
# Copyright 2014 REM <*****@*****.**>.
from pymongo import MongoClient
from pymongo import DESCENDING
import utility
from porter import PorterStemmer

p = PorterStemmer()
client = MongoClient('localhost', 27017)
db = client.uberly
clt = db.uber_vocab_1

#db.uber_dictionary.find().limit(50).sort({value:-1}).pretty()
for entry in clt.find().sort([('value', DESCENDING)]):
    entry['stem'] = p.stem(entry['_id'], 0, len(entry['_id']) - 1)
    clt.save(entry)
    print(entry['_id']), entry['stem']
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
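If a helper like this is called in a tight loop, the per-call PorterStemmer construction can be avoided by sharing one instance, as most of the other examples here do; a sketch (the module-level _stemmer name is made up):

from porter import PorterStemmer

_stemmer = PorterStemmer()   # one shared instance instead of one per call

def stem(word):
    return _stemmer.stem(word, 0, len(word) - 1)

print([stem(w) for w in ["stemming", "stemmed", "stems"]])   # ['stem', 'stem', 'stem']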
import re
from porter import PorterStemmer

p = PorterStemmer()

def lcase(text):
    return text.lower()

def prefixes(text):
    return [text[:3], text[:4], text[:5]]

def suffixes(text):
    return [text[-3:], text[-4:], text[-5:]]

def stem(text):
    if text.isalpha():
        return p.stem(text.lower(), 0, len(text) - 1)
    return text

def is_pair_of_digits(text):
    if re.match("^[0-9]{2}$", text):
        return True
    return False

def is_four_digits(text):
    if re.match("^[0-9]{4}$", text):
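A quick look at what these feature helpers return for a sample token when run in the same module; the stem shown is what the standard Porter algorithm produces:

text = "Normalization"
print(lcase(text))               # 'normalization'
print(prefixes(text))            # ['Nor', 'Norm', 'Norma']
print(suffixes(text))            # ['ion', 'tion', 'ation']
print(stem(text))                # 'normal' under the standard Porter algorithm
print(is_pair_of_digits("42"))   # True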
if(lData[0] == query): return lData[1:] elif (query < lData[0]): print (query+ " < "+lData[0]) return search(query,index+1,middle,m1) elif (query > lData[0]): print (query+ " > "+lData[0]+" so go to index : "+str(index+middle)) return search(query,index+middle,middle,m1) else: return [] print(" size : "+str(int(m1.size()/35))) p = PorterStemmer() query= sys.argv[1] query = p.stem(query,0,len(query)-1) lBytes = search(query,0,int(m1.size()/35),m1) print("lBytes : ",lBytes) if(lBytes != None): termFile = "./indexes/terms.txt" with open(termFile, "r+b") as f: # memory-map the file, size 0 means whole file map = mmap.mmap(f.fileno(), 0) print("Term stuff : ",map[int(lBytes[0]):int(lBytes[1])]) else: print("Not found")
def __init__(self):
    _LanguageSpecificStemmer.__init__(self)
    PorterStemmer.__init__(self)
def __init__(self):
    self.p = PorterStemmer()
    self.sw = stopwords.StopWords(self.stopword_file)
    self.re_tag = RE_TAG
    self.index = None
dictionarySo={} dictionaryProbComPo={} weightPo=0 weightSo=0 dictionaryProbIndPo={} dictionaryProbIndSo={} removeWord=[] #read in training data lines from files, and stopwords (useless words) f=open("training.txt"); v=open("test.txt"); dataFile = open("temp1.txt", "w") comDataFile = open("com.txt", "w") testFile = open("output.txt", "w") stopWord=open("stopwords.txt").read() stopWord=stopWord.split("\n") stemmer= PorterStemmer() countPo=0 countSo=0 trainingSet=f.readlines() testingSet=v.readlines() trainingSet=trainingSet testingSet=testingSet #initialize the stemmer object for (optional) stemming later stemmer= PorterStemmer() stopWord=stemmer.stem(stopWord,0,len(stopWord)-1) def getCleanString(string): """ fix the string for best results the cleaning involve
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t') print("dataset imported") import re import nltk nltk.download('stopwords') # to remove stopword from nltk.corpus import stopwords # for Stemming propose #from nltk.stem.porter import PorterStemmer from porter import PorterStemmer p = PorterStemmer() p.stem("Alcoholic") # Initialize empty array # to append clean text corpus = [] for i in range(0, 1000): # column : "Review", row ith review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # convert all cases to lower cases review = review.lower() review = review.split() ps = PorterStemmer() # loop for stemming each word # in string array at ith row
def __init__(self):
    self._stemmer = PorterStemmer()
def train(): poslines = [] neglines = [] stopwords= open(r'stopwords.txt', 'r').read().splitlines() dataset= open('training_set.csv', 'r',encoding="utf8") dataset.readline() poslines=[] neglines=[] for data in dataset: data.lower() datalines = data.split(",")[1].strip('"').split(' ') DataClass = data.split(",")[0] #tokenizing the sentence if int(DataClass)==0: poslines.append(datalines) if int(DataClass)==1: neglines.append(datalines) else: continue print( "The total positive words are:", len(poslines)) print ("The total negative words are: ", len(neglines)) poslineedited = [] neglinesedited = [] #there are total 6397 positives and negatives. poslinesTrain= poslines[:3201] neglinesTrain= neglines[:3196] priorknowledgepo = [] priorknowledgeneg = [] priorknowledgeneg= 3196/ 6397 priorknowledgepo = 3201/ 6397 stemmer = PorterStemmer() model = open('model_file.csv', 'w',encoding="utf8") trainset= [(x,1) for x in poslinesTrain] + [(x,-1) for x in neglinesTrain] poswords={} #this dictionary stores counts for every word in positives negwords={} #and negatives for line,label in trainset: words= getwords(line) for word in words: word.lower() #increment the counts for this word based on the label #the .get(x, 0) method returns the current count for word #x, of 0 if the word is not yet in the dictionary if label==1: poswords[word]= poswords.get(word, 0) + 1 if label==-1: negwords[word]= negwords.get(word, 0) + 1 positivewordlist = open(r'positive-words.txt', 'r').read().splitlines() negativewordlist = open(r'negative-words.txt', 'r').read().splitlines() #evaluate the test set testset= open('test_set.csv', 'r',encoding="utf8") testset.readline() #make predictions output = open("prediction_file.csv", 'w') for line in testset: linesplit = line.split() testwords= getwords(linesplit) totpos, totneg= 0.0, 0.0 for word in testwords: word.lower() a= poswords.get(word,0.0) + 1.0 b= negwords.get(word,0.0) + 1.0 totpos+= a/(a+b) totneg+= b/(a+b) model.write("Word: " +str(word) + ",") model.write("Relative positive usage: " + str(totpos)+ ",") model.write("Relative negative usage: "+str(totneg)+ '\n')
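The per-word scoring at the end of train() uses the add-one smoothed relative usage a / (a + b); a small worked sketch with invented counts:

# invented word counts standing in for a trained poswords/negwords model
poswords = {"great": 40, "bland": 2}
negwords = {"great": 2, "bland": 10}

totpos, totneg = 0.0, 0.0
for word in ["great", "bland", "unseen"]:
    a = poswords.get(word, 0.0) + 1.0   # add-one smoothing
    b = negwords.get(word, 0.0) + 1.0
    totpos += a / (a + b)
    totneg += b / (a + b)

print(totpos, totneg)   # ~1.65 vs ~1.35, so this toy line leans positive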
def __init__(self, documents, NDC, stop_words): super(Pepper, self).__init__() self.documents = documents self.NDC = NDC self.p = PorterStemmer() self.stop_words = stop_words
def processEmail(email_contents): #PROCESSEMAIL preprocesses a the body of an email and #returns a list of word_indices # word_indices = PROCESSEMAIL(email_contents) preprocesses # the body of an email and returns a list of indices of the # words contained in the email. # # Load Vocabulary vocab = getVocabDict() # Init return value word_indices = [] # ========================== Preprocess Email =========================== # Find the Headers ( \n\n and remove ) # Uncomment the following lines if you are working with raw emails with the # full headers # hdrstart = email_contents.find('\n\n') # email_contents = email_contents[hdrstart+2:] # Lower case email_contents = email_contents.lower() # Strip all HTML # Looks for any expression that starts with < and ends with > # and does not have any < or > in the tag and replace it with a space email_contents = re.sub('<[^<>]+>', ' ', email_contents) # Handle Numbers # Look for one or more characters between 0-9 email_contents = re.sub('[0-9]+', 'number', email_contents) # Handle URLS # Look for strings starting with http:// or https:// email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents) # Handle Email Addresses # Look for strings with @ in the middle email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents) # Handle $ sign email_contents = re.sub('[$]+', 'dollar', email_contents) # ========================== Tokenize Email =========================== # Output the email to screen as well print '\n==== Processed Email ====\n' # Process file l = 0 porterStemmer = PorterStemmer() # Tokenize and also get rid of any punctuation sep = '[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\},\'\"\>\_\<\;\%\n\r]+' for s in re.split(sep, email_contents): # Remove any non alphanumeric characters s = re.sub('[^a-zA-Z0-9]', '', s) # Stem the word s = porterStemmer.stem(s.strip()) # Skip the word if it is too short if len(s) < 1: continue # Look up the word in the dictionary and add to word_indices if # found # ====================== YOUR CODE HERE ====================== # Instructions: Fill in this function to add the index of str to # word_indices if it is in the vocabulary. At this point # of the code, you have a stemmed word from the email in # the variable s. You should look up s in the # vocabulary dictionary (vocab). If a match exists, you # should add the index of the word to the word_indices # vector. Concretely, if s = 'action', then you should # add to word_indices the value under the key 'action' # in vocab. For example, if vocab['action'] = 18, then, # you should add 18 to the word_indices vector # (e.g., word_indices.append(18) ). # # ============================================================= # Print to screen, ensuring that the output lines are not too long if l + len(s) + 1 > 78: print l = 0 print s, l += len(s) + 1 # Print footer print '\n=========================' return array(word_indices)
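The lookup that the exercise comments describe amounts to a dictionary membership check; one possible completion, shown standalone with a toy vocabulary (the real mapping comes from getVocabDict(), and vocab['action'] = 18 follows the example given in the comments):

# toy subset of the vocabulary; values are word indices
vocab = {"action": 18, "number": 431}
word_indices = []
for s in ["action", "unknownword", "number"]:
    if s in vocab:
        word_indices.append(vocab[s])
print(word_indices)   # [18, 431]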