Example #1
    def read_emails(self, path):
        # Get all files
        files = [f for f in listdir(path) if isfile(join(path, f))]

        # Remove the macOS metadata file if present
        try:
            files.remove('DS_Store')
        except ValueError:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = list()

        # Creates the Email Object out of each email file and appends to list
        for file_id in reader.fileids():
            with open(join(path, file_id), 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
Example #2
    def get_text(self):
        text = self.plainTextEdit.toPlainText()
        cleaner = Cleaner()
        cleaned_lines = cleaner.edit_bulk_comments(text)

        for line in cleaned_lines:
            self.textEdit.append(line)
Example #3
    def decide(self):
        # Default message to broadcast in case anything is required
        print('------------', 'AGENT: ', self.name, '---------------')
        self.message = {}
        self.action = action.idle()
        # Conditions to Find Grid Size
        if self.grid_size < 0:
            FindGridSizeMind.run(self)
        if self.grid_size > 0:
            # Order by inverse precedence: most important last (for any classes that affect actions).
            # MappingMind keeps the current state of the map and the
            # age (number of cycles since the last update).
            MappingMind.run(self)
            # Scores each cell by how long ago it was explored and cubes it (older is
            # exponentially more expensive), with a penalty when other agents are closer.
            # Sums each cell's value with those of its surrounding cells (how much the
            # robot wants to go there); the robot targets the closest, most expensive cell.
            GreedyExplore.run(self)
            # If there is a cell to clean, checks whether other, closer bots are able
            # to clean it; if there are, abandons cleaning.
            Cleaner.run(self)
            # Resolves a face-to-face standoff: forces the next two moves to be a turn
            # and a move forward to the right if possible.
            # Follower.run(self)
            Plunger.run(self)
            # Goes to the specified self.target_position via the fewest
            # possible moves, prioritising x first, then y.
            GoToPosition.run(self)

        return self.validate_actions()
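
The "inverse precedence" comment above relies on each mind's run() overwriting shared state such as self.action, so the most important mind runs last and wins. A minimal runnable sketch of that last-writer-wins convention (the class names here are illustrative stand-ins, not the project's):

    class ExploreMind:
        def run(self):
            self.action = 'explore'  # lower precedence: runs first

    class CleanMind:
        def run(self):
            self.action = 'clean'    # higher precedence: runs last and wins

    class Agent(ExploreMind, CleanMind):
        def decide(self):
            ExploreMind.run(self)
            CleanMind.run(self)
            return self.action

    print(Agent().decide())  # prints 'clean'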
Example #4
    def test_cleaner_birthday_Invalid_3_response2(self):
        clean = Cleaner()
        test_data = "23-11-99"
        expected_result = "The year needs to be in the full format eg: 2009"
        actual_result = clean.Clean_Birthday(test_data)[1]
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + str(expected_result))
Example #5
    def test_cleaner_birthday_Invalid_3_response1(self):
        clean = Cleaner()
        test_data = "hello-break-me"
        expected_result = None
        actual_result = clean.Clean_Birthday(test_data)[0]
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + str(expected_result))
Example #6
    def test_cleaner_birthday_valid_2(self):
        clean = Cleaner()
        test_data = "25-11-1991"
        expected_result = "25-11-1991"
        actual_result = clean.Clean_Birthday(test_data)[0]
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + str(expected_result))
Example #7
    def test_cleaner_age_invalid(self):
        clean = Cleaner()
        test_data = "nine"
        expected_result = None
        actual_result = clean.Clean_Age(test_data)[0]
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + str(expected_result))
Example #8
    def test_cleaner_age_valid_Int(self):
        clean = Cleaner()
        test_data = 99
        expected_result = 99
        actual_result = clean.Clean_Age(test_data)[0]
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + str(expected_result))
Example #9
    def __init__(self, dirPath, binsNum):
        self.binsNum = binsNum
        self.dirPath = dirPath
        self.m_estimate = 2
        self.loadStructure()
        try:
            self.df = pd.read_csv(self.dirPath + "/train.csv")
        except IOError:
            tkMessageBox.showerror(
                "Naive Bayes Classifier - Error",
                "There was a problem opening " + self.dirPath + "/train.csv")
        self.cleaner = Cleaner(self)
        self.naiveBases = {}  # maps attribute value + classification to its m-estimate
        self.cProb = {}
        for (i, record) in self.df.iterrows():
            recordDic = record.to_dict()
            for attribute in recordDic:
                value = recordDic[attribute]
                c = recordDic["class"]
                n_c = len(self.df.loc[((self.df[attribute] == value) &
                                       (self.df["class"] == c))].index)
                n = len(self.df.loc[(self.df["class"] == c)].index)
                m = self.m_estimate
                M = len(self.structure[attribute])
                p = float(1) / M
                naiveBase = float(n_c + m * p) / (n + m)
                self.naiveBases[attribute + str(value) + c] = naiveBase
        for c in self.structure["class"]:
            self.cProb[c] = float(
                len(self.df.loc[(self.df["class"] == c)].index)) / len(
                    self.df.index)
        tkMessageBox.showinfo("Naive Bayes Classifier - Success",
                              "Building classifier using train-set is done!")
Example #10
    def run(self):

        # Authentication
        if self.auth():
            print(self.DASH)
            print('Reddit Authentication Successful.\nWelcome, {}'.format(
                self.REDDIT_USERNAME))

        # Pick a subreddit at random from the pool
        subreddit = self.getSubreddit(random.choice(self.SUBREDDITS))

        # Get posts of the chosen subreddit
        posts = self.getSubInfo(subreddit)

        print(self.DASH)
        print("Starting uploads...")
        print(self.DASH)

        self.postToIG(posts)

        print('Uploaded {} posts'.format(self.COUNT))

        print('Starting cleaner...')
        Cleaner.clean()
Example #11
    def test_cleaner_bmi(self):
        clean = Cleaner()
        test_data = 'normal'
        expected_result = 'Normal'

        actual_result = clean.clean_bmi(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #12
    def test_cleaner_bmi_3(self):
        clean = Cleaner()
        test_data = 'overweight'
        expected_result = 'Overweight'

        actual_result = clean.clean_bmi(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #13
    def test_cleaner_bmi_4(self):
        clean = Cleaner()
        test_data = 'OBEsity'
        expected_result = 'Obesity'

        actual_result = clean.clean_bmi(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #14
    def test_cleaner_gender_4(self):
        clean = Cleaner()
        test_data = 'f'
        expected_result = 'F'

        actual_result = clean.clean_gender(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #15
    def test_cleaner_bmi_2(self):
        clean = Cleaner()
        test_data = 'UNDERWEIGHT'
        expected_result = 'Underweight'

        actual_result = clean.clean_bmi(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #16
    def test_cleaner_empid_2(self):
        clean = Cleaner()
        test_data = 'a102'
        expected_result = 'A102'

        actual_result = clean.clean_empid(test_data)

        self.assertEqual(actual_result, expected_result, "actual_result should equal " + expected_result)
Example #17
    def __init__(self, cfg_params):
        """
        constructor
        """
        Cleaner.__init__(self, cfg_params)
        self.cfg_params = cfg_params

        # init client server params...
        CliServerParams(self)
        return
Example #19
    def execute(self):
        from pandas import read_csv, merge
        from os.path import join
        from Cleaner import Cleaner

        train = read_csv(join(self.directory, 'train.csv'))
        store = read_csv(join(self.directory, 'store.csv'))
        train = merge(train, store, how='left', on='Store')

        # Cleaner wraps the merged frame; the cleaned frame is exposed as .data
        train = Cleaner(train)
        train.extractDate()

        self.trainingSet = train.data
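
The how='left' merge above keeps every row of train and attaches the matching store columns by 'Store'. A toy illustration of that join:

    import pandas as pd

    train = pd.DataFrame({'Store': [1, 2, 1], 'Sales': [100, 80, 120]})
    store = pd.DataFrame({'Store': [1, 2], 'StoreType': ['a', 'b']})
    print(pd.merge(train, store, how='left', on='Store'))
    #    Store  Sales StoreType
    # 0      1    100         a
    # 1      2     80         b
    # 2      1    120         a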
Example #20
    def __init__(self, url):
        self.res_dict = {
            'Title': [],
            'Content': [],
            'Title + Content': [],
            'URL': [],
            'ID': []
        }
        self.url = url
        self.sg = SearchGenerator(self.url)
        self.search_terms = np.asarray(self.sg.extract_keywords())
        self.df = ''
        self.cleaner = Cleaner()
Example #21
class ThreadScraper:
    def __init__(self, url):
        self.res_dict = {
            'Title': [],
            'Content': [],
            'Title + Content': [],
            'URL': [],
            'ID': []
        }
        self.url = url
        self.sg = SearchGenerator(self.url)
        self.search_terms = np.asarray(self.sg.extract_keywords())
        self.df = ''
        self.cleaner = Cleaner()

    def get_submissions(self, term):
        submissions = self.sg.get_reddit().subreddit(
            str(self.sg.get_subreddit())).search(term[0],
                                                 time_filter='year',
                                                 syntax='plain')
        for sub in submissions:
            title = sub.title
            content = sub.selftext
            url = sub.url
            id = sub.id
            # str.endswith accepts a tuple of suffixes
            if (not url.endswith(('.jpg', '.png', '.gif')) and len(content) > 50
                    and 'http' not in content and id not in self.res_dict['ID']):
                self.res_dict['Title'].append(
                    self.cleaner.clean_text(title).split())
                self.res_dict['Content'].append(
                    self.cleaner.clean_text(content).split())
                self.res_dict['Title + Content'].append(
                    self.cleaner.clean_text(title + ' ' + content).split())
                self.res_dict['URL'].append(url)
                self.res_dict['ID'].append(id)

    def export_submission(self):
        with concurrent.futures.ThreadPoolExecutor(8) as executor:
            executor.map(self.get_submissions, self.search_terms)
        df = pd.DataFrame(self.res_dict)
        df.dropna(inplace=True)
        df.reset_index()
        self.df = df
        if not os.path.exists('data'):
            os.makedirs('data')
        print("Writing to CSV")
        df.to_csv('data/results.csv')
        print("Done...")
        return df
Example #22
    def __init__(self, queryFile, queryJSON):
        # Initialize the cleaner object
        self._cleaner = Cleaner(" ", " ")
        # txt file in which all queries are stored
        self._qFile = queryFile
        # json file to store the queries after cleaning
        self._qJson = queryJSON
        # list to store raw queries
        self._queryList = list()
        # dict to store the refined queries
        self._queryDict = dict()
        # stop-word list
        self._stopList = list()
        # query ID, initialized to 1
        self._qID = 1
Example #23
    def get_recommendations(self):
        cleaner = Cleaner()
        sg = SearchGenerator(self.url)
        words = self.dictionary.doc2bow(sg.get_cleancontent().split())
        print("Top words identified: ")
        for word in words:
            print("{} {}".format(word[0], self.dictionary[word[0]]))
        query_vector = self.lda[words]
        sims = self.get_similarity(self.lda, query_vector)
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        idx = 0
        pids = []
        result = 10
        recommendation = []
        page_ids = self.df['ID'].to_list()
        print("\nCheck out the links below:")
        # Walk the ranked list until ten distinct pages have been recommended
        while result > 0:
            pageid = page_ids[sims[idx][0]]
            if pageid not in pids:
                pids.append(pageid)
                print("{}".format(self.df[self.df['ID'] == pageid]['URL'].values[0]))
                recommendation.append(self.df[self.df['ID'] == pageid]['URL'].values[0])
                result -= 1
            idx += 1
        return recommendation
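
The line sims = sorted(enumerate(sims), key=lambda item: -item[1]) pairs every document index with its similarity score and orders the pairs by descending score, so sims[0][0] is the index of the best match. A tiny self-contained illustration:

    sims = [0.12, 0.87, 0.45]
    ranked = sorted(enumerate(sims), key=lambda item: -item[1])
    print(ranked)  # [(1, 0.87), (2, 0.45), (0, 0.12)]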
Example #24
    def __init__(self, localDownloadQueue="PendingDownloadQueue"):
        Base.__init__(self)
        self.download_queue = localDownloadQueue
        self.ftp_sync = FileSyncer()
        self.move_file_into_processing()
        Extractor(self.local_directory_to_sync)
        Cleaner(self.local_directory_to_sync)
Example #25
class Content:
    def __init__(self, df, url):
        self.df = df
        self.cleaner = Cleaner()

    def clean_frame(self):
        self.df = self.df[[
            'Title', 'Content'
        ]].apply(lambda x: self.cleaner.clean_text(x).split())
Example #26
import re


class Validator(object):

    clean = Cleaner()

    def val_empid(self, data):
        data = self.clean.clean_empid(data)
        if len(data) == 4:
            if not data[0].isalpha():
                return False
            # The remaining three characters must all be digits
            for x in data[1:]:
                if not x.isdigit():
                    return False
            return True
        else:
            return False

    def val_gender(self, data):
        data = self.clean.clean_gender(data)
        if data in ("M", "F"):
            return True
        else:
            return False

    def val_age(self, data):
        self.clean.clean_age(data)
        return True

    def Validate_Sales(self, Given_Sales):
        # Check that the sales figure is a three-digit number
        pattern = re.compile(r'\d{3}')
        if pattern.match(Given_Sales):
            return True
        else:
            return False

    def val_bmi(self, data):
        data = self.clean.clean_bmi(data)
        if data in ('Normal', 'Overweight', 'Obesity', 'Underweight'):
            return True
        else:
            return False

    def Validate_Salary(self, Given_Salary):
        # Check that the salary is a two- or three-digit number
        pattern = re.compile(r'[0-9]{2,3}')
        if pattern.match(Given_Salary):
            return True
        else:
            return False

    def val_birthday(self, data):
        self.clean.clean_birthday(data)
        return True
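
A quick usage sketch of the validator above, assuming the Cleaner methods behave as the earlier tests suggest (clean_empid and clean_gender upper-case their input, clean_bmi title-cases it):

    v = Validator()
    print(v.val_empid('a102'))   # True: one letter followed by three digits
    print(v.val_gender('f'))     # True: cleaned to 'F'
    print(v.val_bmi('obesity'))  # True: cleaned to 'Obesity'
    print(v.val_gender('x'))     # False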
Example #27
    def get_important_words(self, emails, path=None):

        cleaner = Cleaner()

        complete_email_text = ''

        for email in emails:
            email_header = cleaner.delete_tags(email.header)
            email_body = cleaner.delete_tags(email.body)

            topic_line = re.findall(r'Topic.*\n', email_header)[0]
            topic_line = topic_line[6:].strip()

            complete_email_text = complete_email_text + topic_line + '\n' + email_body + '\n'

        # Cleaning the text
        complete_email_text = re.sub(r'\n', ' ', complete_email_text)
        complete_email_text = re.sub(r'\s', ' ', complete_email_text)
        complete_email_text = re.sub(r' +', ' ', complete_email_text)

        complete_email_text = tb(complete_email_text)
        bloblist = [complete_email_text]

        words = []

        # Test
        # print(bloblist)

        for i, blob in enumerate(bloblist):
            scores = {word: self.tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words:
                words.append(word)

            # Delete Stop-Words
            words = self.delete_stopwords(words)

            if path is not None:
                with open(path, 'w') as current_file:
                    for word in words:
                        current_file.write('{}\n'.format(word))

        return words
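
The self.tfidf helper is not shown in this example. A common definition consistent with the TextBlob-style blob/bloblist arguments here (an assumption about the intended scoring, not the project's confirmed code) is:

    import math

    def tf(word, blob):
        # Term frequency within one document
        return blob.words.count(word) / len(blob.words)

    def n_containing(word, bloblist):
        return sum(1 for blob in bloblist if word in blob.words)

    def idf(word, bloblist):
        # Inverse document frequency; the 1 + ... guards against division by zero
        return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

    def tfidf(word, blob, bloblist):
        return tf(word, blob) * idf(word, bloblist)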
Example #28
    def __init__(self):
        self.grid_size = -1
        self.message = {}
        self.action = action.idle()
        self.observation = []
        self.position = (-1, -1)
        self.colour = 'none'
        self.orientation = 'none'
        self.dirt = 'none'
        self.name = 'none'
        # Load the messages
        self.messages = []

        FindGridSizeMind.__init__(self)
        # MAPPING MAP MAX BANDWIDTH OF 80
        MappingMind.__init__(self)
        GreedyExplore.__init__(self)
        Cleaner.__init__(self)
        GoToPosition.__init__(self)
        Plunger.__init__(self)
Example #29
    def buildTreesAndDics(self, text):
        tic = time()

        for i in range(self.max_len):
            n_gram_list = sum(
                map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
            sys.stdout.write('build tree done %d/%d\r' % (i, self.max_len))
Example #30
def readline(raw_line, buf):
    '''Load the '#fields' and '#types' header columns into buf for later transformation.'''
    line = raw_line.strip()

    if line.startswith('#fields'):
        fields = raw_line.split('\t')
        fields[-1] = fields[-1].rstrip()  # drop the newline carried by the last column
        fields = Cleaner.replace(fields)
        buf['fields'] = fields

    if line.startswith('#types'):
        types = raw_line.split('\t')
        types[-1] = types[-1].rstrip()
        buf['types'] = types
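
The tab-separated '#fields'/'#types' header lines handled above look like the Bro/Zeek log format (an inference, not stated in the example). A self-contained sketch of the same column parsing, leaving out the project's Cleaner.replace step:

    buf = {}
    raw = '#fields\tts\tuid\tid.orig_h\tid.orig_p\n'
    fields = raw.split('\t')
    fields[-1] = fields[-1].rstrip()  # strip the newline carried by the final column
    buf['fields'] = fields
    print(buf['fields'])  # ['#fields', 'ts', 'uid', 'id.orig_h', 'id.orig_p']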
Example #31
    def buildTreesAndDics(self, text):
        tic = time()

        pbar = tqdm(range(self.max_len))
        for i in pbar:
            pbar.set_description("buildTreesAndDics, %d-gram" % (i + 1))
            n_gram_list = sum(map(lambda x: Cleaner.n_gram(x, i + 1), text),
                              [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
        print("build tree done! %.2fs" % (time() - tic))
Example #32
    def __init__(self, rfpath, max_len=4):
        self.prefixTree = Trie()
        self.suffixTree = Trie(direction='suffix')

        self.vocabulary = []
        self.len_dict = dict()
        # To count words of n characters you must use (n+1)-grams
        self.max_len = max_len + 1

        text = Cleaner.preprocess_text(rfpath)
        self.buildTreesAndDics(text)
        self.prefixTree.set_entropy()
        self.suffixTree.set_entropy()

        self.words = dict()
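
Cleaner.n_gram itself is not shown in these examples; from its use above it evidently returns all character n-grams of a string. A minimal stand-in with that assumed behaviour:

    def n_gram(text, n):
        # All contiguous character n-grams of text (assumed behaviour of Cleaner.n_gram)
        return [text[i:i + n] for i in range(len(text) - n + 1)]

    print(n_gram('abcd', 2))  # ['ab', 'bc', 'cd']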