def read_emails(self, path):
    # Collect all regular files in the directory
    files = [f for f in listdir(path) if isfile(join(path, f))]
    # Drop the macOS Finder metadata file if present
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    cleaner = Cleaner()
    emails = []
    # Create an Email object out of each email file and append it to the list
    for file_id in reader.fileids():
        with open(join(path, file_id), 'r') as current_file:
            cleaned_contents = cleaner.clean_file(current_file.read())
        split_email_header, split_email_body, split_email_file_id = self.divide(
            cleaned_contents, file_id)
        emails.append(
            Email(split_email_header, split_email_body, split_email_file_id))
    # Return the list of Email objects
    return emails
def get_text(self):
    words = self.plainTextEdit.toPlainText()
    cleaner = Cleaner()
    cleaned_comments = cleaner.edit_bulk_comments(words)
    for item in cleaned_comments:
        self.textEdit.append(item)
def decide(self):
    # Default message to broadcast in case anything is required
    print('------------', 'AGENT: ', self.name, '---------------')
    self.message = {}
    self.action = action.idle()

    # Conditions to find the grid size
    if self.grid_size < 0:
        FindGridSizeMind.run(self)
    if self.grid_size > 0:
        # Minds run in inverse order of precedence: the most important ones
        # last (for any classes that affect actions).

        # Keeps the current state of the map and the age
        # (number of cycles since update) of each cell.
        MappingMind.run(self)

        # Places a value on each cell by how long ago it was explored and
        # cubes it (older cells are exponentially more expensive); if other
        # agents are closer, a penalty is applied. Each cell's value is then
        # summed with its neighbours' (how much the robot wants to go there),
        # and the robot targets the closest most expensive cell.
        GreedyExplore.run(self)

        # If there is a cell to clean, checks whether other bots able to
        # clean it are closer; if so, abandons the cleaning.
        Cleaner.run(self)

        # Resolves a face-to-face argument by forcing the next two moves to
        # be a turn and a forward to the right, if possible.
        # Follower.run(self)
        Plunger.run(self)

        # Goes to the specified self.target_position via the fewest possible
        # moves, prioritising x first, then y.
        GoToPosition.run(self)
    return self.validate_actions()
def test_cleaner_birthday_Invalid_3_response2(self):
    clean = Cleaner()
    test_data = "23-11-99"
    expected_result = "The year needs to be in the full format eg: 2009"
    actual_result = clean.Clean_Birthday(test_data)[1]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_birthday_Invalid_3_response1(self):
    clean = Cleaner()
    test_data = "hello-break-me"
    expected_result = None
    actual_result = clean.Clean_Birthday(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_birthday_valid_2(self):
    clean = Cleaner()
    test_data = "25-11-1991"
    expected_result = "25-11-1991"
    actual_result = clean.Clean_Birthday(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_age_invalid(self):
    clean = Cleaner()
    test_data = "nine"
    expected_result = None
    actual_result = clean.Clean_Age(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))

def test_cleaner_age_valid_Int(self):
    clean = Cleaner()
    test_data = 99
    expected_result = 99
    actual_result = clean.Clean_Age(test_data)[0]
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + str(expected_result))
def __init__(self, dirPath, binsNum):
    self.binsNum = binsNum
    self.dirPath = dirPath
    self.m_estimate = 2
    self.loadStructure()
    try:
        self.df = pd.read_csv(self.dirPath + "/train.csv")
    except IOError:
        tkMessageBox.showerror(
            "Naive Bayes Classifier - Error",
            "There is a problem with opening " + self.dirPath + "/train.csv")
    self.cleaner = Cleaner(self)
    # Maps attribute + value + classification to its m-estimate probability
    self.naiveBases = {}
    self.cProb = {}
    for _, record in self.df.iterrows():
        recordDic = record.to_dict()
        for attribute in recordDic:
            value = recordDic[attribute]
            c = recordDic["class"]
            # n_c: records with this attribute value and this class;
            # n: records with this class
            n_c = len(self.df.loc[((self.df[attribute] == value)
                                   & (self.df["class"] == c))].index)
            n = len(self.df.loc[(self.df["class"] == c)].index)
            m = self.m_estimate
            M = len(self.structure[attribute])
            p = float(1) / M  # uniform prior over the attribute's M values
            # m-estimate smoothing: (n_c + m*p) / (n + m)
            naiveBase = float(n_c + m * p) / (n + m)
            self.naiveBases[attribute + str(value) + c] = naiveBase
    # Prior probability of each class
    for c in self.structure["class"]:
        self.cProb[c] = float(
            len(self.df.loc[(self.df["class"] == c)].index)) / len(
                self.df.index)
    tkMessageBox.showinfo("Naive Bayes Classifier - Success",
                          "Building classifier using train-set is done!")
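The dictionaries built above (per-class priors in cProb, per-attribute m-estimates in naiveBases) are everything a naive Bayes prediction needs. Below is a minimal sketch of how a record could be classified with them; the function name and the fallback of 1.0 for attribute values never seen in training are assumptions, not part of the original code.

def classify_record(classifier, record):
    # Hypothetical sketch, not project code: pick the class c maximising
    # P(c) * product over attributes of P(attribute=value | c).
    best_class, best_score = None, -1.0
    for c in classifier.structure["class"]:
        score = classifier.cProb[c]  # class prior P(c)
        for attribute, value in record.items():
            if attribute == "class":
                continue
            # m-estimate P(attribute=value | c); unseen values fall back to
            # 1.0 here (an assumption) so they do not zero out the product
            score *= classifier.naiveBases.get(attribute + str(value) + c, 1.0)
        if score > best_score:
            best_class, best_score = c, score
    return best_class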
def run(self):
    # AUTHENTICATION
    if self.auth():
        print(self.DASH)
        print('Reddit Authentication Successful.\nWelcome, {}'.format(
            self.REDDIT_USERNAME))
        # Pick a subreddit at random from the pool
        r = random.randint(0, len(self.SUBREDDITS) - 1)
        subreddit = self.getSubreddit(self.SUBREDDITS[r])
        # Get posts from the chosen subreddit
        posts = self.getSubInfo(subreddit)
        print(self.DASH)
        print("Starting uploads...")
        print(self.DASH)
        self.postToIG(posts)
        print('Uploaded {} posts'.format(self.COUNT))
        print('Starting cleaner...')
        Cleaner.clean()
def test_cleaner_bmi(self):
    clean = Cleaner()
    test_data = 'normal'
    expected_result = 'Normal'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_3(self):
    clean = Cleaner()
    test_data = 'overweight'
    expected_result = 'Overweight'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_4(self):
    clean = Cleaner()
    test_data = 'OBEsity'
    expected_result = 'Obesity'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_gender_4(self):
    clean = Cleaner()
    test_data = 'f'
    expected_result = 'F'
    actual_result = clean.clean_gender(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_bmi_2(self):
    clean = Cleaner()
    test_data = 'UNDERWEIGHT'
    expected_result = 'Underweight'
    actual_result = clean.clean_bmi(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)

def test_cleaner_empid_2(self):
    clean = Cleaner()
    test_data = 'a102'
    expected_result = 'A102'
    actual_result = clean.clean_empid(test_data)
    self.assertEqual(actual_result, expected_result,
                     "actual_result should equal " + expected_result)
def __init__(self, cfg_params):
    """Constructor."""
    Cleaner.__init__(self, cfg_params)
    self.cfg_params = cfg_params
    # Initialise client-server parameters
    CliServerParams(self)
def execute(self):
    from pandas import read_csv, merge
    from os.path import join
    from Cleaner import Cleaner

    # Load the training data and join in the per-store metadata
    train = read_csv(join(self.directory, 'train.csv'))
    store = read_csv(join(self.directory, 'store.csv'))
    train = merge(train, store, how='left', on='Store')
    train = Cleaner(train)
    train.extractDate()
    self.trainingSet = train.data
class ThreadScraper:
    def __init__(self, url):
        self.res_dict = {
            'Title': [],
            'Content': [],
            'Title + Content': [],
            'URL': [],
            'ID': []
        }
        self.url = url
        self.sg = SearchGenerator(self.url)
        self.search_terms = np.asarray(self.sg.extract_keywords())
        self.df = ''
        self.cleaner = Cleaner()

    def get_submissions(self, term):
        submissions = self.sg.get_reddit().subreddit(
            str(self.sg.get_subreddit())).search(term[0],
                                                 time_filter='year',
                                                 syntax='plain')
        for sub in submissions:
            title = sub.title
            content = sub.selftext
            url = sub.url
            sub_id = sub.id
            # Keep only text posts: skip images, short or link-only bodies,
            # and submissions already collected
            if (not url.endswith(('.jpg', '.png', '.gif'))
                    and len(content) > 50 and 'http' not in content
                    and sub_id not in self.res_dict['ID']):
                self.res_dict['Title'].append(
                    self.cleaner.clean_text(title).split())
                self.res_dict['Content'].append(
                    self.cleaner.clean_text(content).split())
                self.res_dict['Title + Content'].append(
                    self.cleaner.clean_text(title + ' ' + content).split())
                self.res_dict['URL'].append(url)
                self.res_dict['ID'].append(sub_id)

    def export_submission(self):
        # Fetch submissions for all search terms in parallel
        with concurrent.futures.ThreadPoolExecutor(8) as executor:
            executor.map(self.get_submissions, self.search_terms)
        df = pd.DataFrame(self.res_dict)
        df.dropna(inplace=True)
        # Renumber the rows after dropping NaNs
        df.reset_index(drop=True, inplace=True)
        self.df = df
        if not os.path.exists('data'):
            os.makedirs('data')
        print("Writing to CSV")
        df.to_csv('data/results.csv')
        print("Done...")
        return df
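A minimal usage sketch for ThreadScraper, with a hypothetical thread URL; export_submission fans the search terms out over eight worker threads, then writes data/results.csv:

scraper = ThreadScraper('https://www.reddit.com/r/somesub/comments/abc123/')  # hypothetical URL
df = scraper.export_submission()  # scrapes, writes data/results.csv, returns the DataFrame
print(df[['Title', 'URL']].head())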
def __init__(self, queryFile, queryJSON):
    # Initialize the cleaner object
    self._cleaner = Cleaner(" ", " ")
    # txt file in which all queries are stored
    self._qFile = queryFile
    # json file to store the queries after cleaning
    self._qJson = queryJSON
    # list to store raw queries
    self._queryList = list()
    # dict to store refined queries
    self._queryDict = dict()
    # stop-word list
    self._stopList = list()
    # query ID, initialized to 1
    self._qID = 1
def get_recommendations(self):
    cleaner = Cleaner()
    sg = SearchGenerator(self.url)
    words = self.dictionary.doc2bow(sg.get_cleancontent().split())
    print("Top words identified: ")
    for word in words:
        print("{} {}".format(word[0], self.dictionary[word[0]]))
    # Rank all documents by similarity to the query vector
    query_vector = self.lda[words]
    sims = self.get_similarity(self.lda, query_vector)
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    idx = 0
    pids = []
    result = 10
    recommendation = []
    page_ids = self.df['ID'].to_list()
    print("\nCheck out the links below:")
    # Collect the top 10 unique pages (guard against running past sims)
    while result > 0 and idx < len(sims):
        pageid = page_ids[sims[idx][0]]
        if pageid not in pids:
            pids.append(pageid)
            url = self.df[self.df['ID'] == pageid]['URL'].values[0]
            print("{}".format(url))
            recommendation.append(url)
            result -= 1
        idx += 1
    return recommendation
def __init__(self, localDownloadQueue="PendingDownloadQueue"):
    Base.__init__(self)
    self.download_queue = localDownloadQueue
    self.ftp_sync = FileSyncer()
    self.move_file_into_processing()
    # Run extraction and cleaning over the freshly synced directory
    Extractor(self.local_directory_to_sync)
    Cleaner(self.local_directory_to_sync)
class Content:
    def __init__(self, df, url):
        self.df = df
        self.url = url
        self.cleaner = Cleaner()

    def clean_frame(self):
        # Clean and tokenise every cell of the Title and Content columns
        # (applymap applies per cell; apply would pass whole columns)
        self.df = self.df[['Title', 'Content']].applymap(
            lambda x: self.cleaner.clean_text(x).split())
class Validator(object):
    clean = Cleaner()

    def val_empid(self, data):
        # Employee ID: one letter followed by three digits, e.g. 'A102'
        data = self.clean.clean_empid(data)
        if len(data) != 4 or not data[0].isalpha():
            return False
        return data[1:].isdigit()

    def val_gender(self, data):
        data = self.clean.clean_gender(data)
        return data == "M" or data == "F"

    def val_age(self, data):
        self.clean.clean_age(data)
        return True

    def Validate_Sales(self, Given_Sales):
        # Sales must contain a three-digit number
        pattern = re.compile(r'\d{3}')
        return bool(pattern.match(Given_Sales))

    def val_bmi(self, data):
        data = self.clean.clean_bmi(data)
        return data in ('Normal', 'Overweight', 'Obesity', 'Underweight')

    def Validate_Salary(self, Given_Salary):
        # Salary must start with a two- or three-digit number
        pattern = re.compile(r'[0-9]{2,3}')
        return bool(pattern.match(Given_Salary))

    def val_birthday(self, data):
        self.clean.clean_birthday(data)
        return True
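A minimal usage sketch for Validator, assuming the Cleaner behaviour exercised by the tests above (upper-casing IDs, genders, and BMI categories):

v = Validator()
v.val_empid('a102')    # True: cleaned to 'A102', one letter then three digits
v.val_gender('f')      # True: cleaned to 'F'
v.val_bmi('OBEsity')   # True: cleaned to 'Obesity'
v.val_empid('12AB')    # False: does not start with a letter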
def get_important_words(self, emails, path=None):
    cleaner = Cleaner()
    complete_email_text = ''
    for email in emails:
        email_header = cleaner.delete_tags(email.header)
        email_body = cleaner.delete_tags(email.body)
        topic_line = re.findall(r'Topic.*\n', email_header)[0]
        topic_line = topic_line[6:].strip()
        complete_email_text = (complete_email_text + topic_line + '\n' +
                               email_body + '\n')

    # Clean the text: collapse newlines and runs of whitespace
    complete_email_text = re.sub('\n', ' ', complete_email_text)
    complete_email_text = re.sub(r'\s', ' ', complete_email_text)
    complete_email_text = re.sub(' +', ' ', complete_email_text)
    complete_email_text = tb(complete_email_text)

    # Score every word by TF-IDF and sort from highest to lowest
    bloblist = [complete_email_text]
    words = []
    for blob in bloblist:
        scores = {word: self.tfidf(word, blob, bloblist)
                  for word in blob.words}
        sorted_words = sorted(scores.items(), key=lambda x: x[1],
                              reverse=True)
        for word, score in sorted_words:
            words.append(word)

    # Delete stop-words
    words = self.delete_stopwords(words)
    if path is not None:
        with open(path, 'w') as current_file:
            for word in words:
                current_file.write('{}\n'.format(word))
    return words
def __init__(self):
    self.grid_size = -1
    self.message = {}
    self.action = action.idle()
    self.observation = []
    self.position = (-1, -1)
    self.colour = 'none'
    self.orientation = 'none'
    self.dirt = 'none'
    self.name = 'none'

    # MESSAGES: load the message buffer
    self.messages = []

    # Initialise each mind in turn
    FindGridSizeMind.__init__(self)
    # Mapping map, max bandwidth of 80
    MappingMind.__init__(self)
    GreedyExplore.__init__(self)
    Cleaner.__init__(self)
    GoToPosition.__init__(self)
    Plunger.__init__(self)
def buildTreesAndDics(self, text):
    tic = time()
    for i in range(self.max_len):
        # Flatten the (i+1)-gram lists of every sentence into one list
        n_gram_list = sum(
            map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
        self.len_dict[i + 1] = len(n_gram_list)
        if i >= 1:
            self.vocabulary.extend(list(set(n_gram_list)))
        # Insert every n-gram into both the prefix and the suffix tree
        for word in n_gram_list:
            self.prefixTree.insert(word, i + 1)
            self.suffixTree.insert(word, i + 1)
        sys.stdout.write('build tree done %d/%d\r' % (i, self.max_len))
    print('build tree done! %.2fs' % (time() - tic))
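For reference, a plausible sketch of what Cleaner.n_gram is assumed to compute here; the project's real implementation is not shown in this section:

def n_gram(sentence, n):
    # All contiguous character n-grams of a sentence,
    # e.g. n_gram('abcd', 2) -> ['ab', 'bc', 'cd']
    return [sentence[i:i + n] for i in range(len(sentence) - n + 1)]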
def readline(raw_line, buf):
    '''Load data into the 'fields' and 'types' buffers to be transformed later.'''
    line = raw_line.strip()
    if line.startswith('#fields'):
        fields = raw_line.split('\t')
        fields[-1] = fields[-1].rstrip()
        fields = Cleaner.replace(fields)
        buf['fields'] = fields
    if line.startswith('#types'):
        types = raw_line.split('\t')
        types[-1] = types[-1].rstrip()
        buf['types'] = types
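A minimal usage sketch with made-up Zeek/Bro-style header lines (tab-separated); Cleaner.replace is applied to the fields, so only the types buffer is shown verbatim:

buf = {}
readline('#fields\tts\tuid\tid.orig_h\n', buf)  # hypothetical header line
readline('#types\ttime\tstring\taddr\n', buf)   # hypothetical header line
# buf['types'] == ['#types', 'time', 'string', 'addr']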
def buildTreesAndDics(self, text):
    tic = time()
    pbar = tqdm(range(self.max_len))
    for i in pbar:
        pbar.set_description("buildTreesAndDics, %d-gram" % (i + 1))
        # Flatten the (i+1)-gram lists of every sentence into one list
        n_gram_list = sum(map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
        self.len_dict[i + 1] = len(n_gram_list)
        if i >= 1:
            self.vocabulary.extend(list(set(n_gram_list)))
        # Insert every n-gram into both the prefix and the suffix tree
        for word in n_gram_list:
            self.prefixTree.insert(word, i + 1)
            self.suffixTree.insert(word, i + 1)
    print("build tree done! %.2fs" % (time() - tic))
def __init__(self, rfpath, max_len=4):
    self.prefixTree = Trie()
    self.suffixTree = Trie(direction='suffix')
    self.vocabulary = []
    self.len_dict = dict()
    # Counting words of n characters requires (n+1)-grams
    self.max_len = max_len + 1
    text = Cleaner.preprocess_text(rfpath)
    self.buildTreesAndDics(text)
    self.prefixTree.set_entropy()
    self.suffixTree.set_entropy()
    self.words = dict()
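A hypothetical usage sketch; the class name WordDiscoverer and the corpus path are assumptions, since this section only shows the constructor and tree-building method:

wd = WordDiscoverer('corpus.txt', max_len=4)  # hypothetical name; internally builds up to 5-grams
print(wd.len_dict)  # number of n-grams found for each length 1..5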