Example #1
0
 def __init__(self, localDownloadQueue="PendingDownloadQueue"):
     """Set up the download pipeline: base init, FTP sync, move, extract, clean.

     Args:
         localDownloadQueue: name of the queue directory to pull downloads from.
     """
     Base.__init__(self)
     self.download_queue = localDownloadQueue
     self.ftp_sync = FileSyncer()
     # Relocate queued files before the extract/clean passes run on them.
     self.move_file_into_processing()
     # Extractor/Cleaner appear to do their work inside their constructors;
     # the instances are deliberately discarded. NOTE(review): confirm that
     # self.local_directory_to_sync is set by Base.__init__.
     Extractor(self.local_directory_to_sync)
     Cleaner(self.local_directory_to_sync)
Example #2
0
 def get_recommendations(self):
     """Print and return up to 10 unique page URLs ranked by LDA similarity.

     Returns:
         list: recommended URL strings, best match first (may be shorter
         than 10 if fewer unique pages exist).
     """
     sg = SearchGenerator(self.url)
     words = self.dictionary.doc2bow(sg.get_cleancontent().split())
     print("Top words identified: ")
     for word in words:
         print("{} {}".format(word[0], self.dictionary[word[0]]))
     query_vector = self.lda[words]
     sims = self.get_similarity(self.lda, query_vector)
     # Rank documents by descending similarity score.
     sims = sorted(enumerate(sims), key=lambda item: -item[1])
     idx = 0
     pids = []
     result = 10
     recommendation = []
     page_ids = self.df['ID'].to_list()
     print("\nCheck out the links below:")
     # Walk down the ranking, skipping duplicate page ids. The bounds check
     # fixes an IndexError/infinite loop in the original when fewer than 10
     # unique pages are available. (An unused `cleaner = Cleaner()` local
     # was also removed.)
     while result > 0 and idx < len(sims):
         pageid = page_ids[sims[idx][0]]
         if pageid not in pids:
             pids.append(pageid)
             # Look the URL up once instead of twice.
             url = self.df[self.df['ID'] == pageid]['URL'].values[0]
             print("{}".format(url))
             recommendation.append(url)
             result -= 1
         idx += 1
     return recommendation
 def test_cleaner_age_valid_Int(self):
     """A valid integer age should come back unchanged in slot 0."""
     clean = Cleaner()
     test_data = 99
     expected_result = 99
     actual_result = clean.Clean_Age(test_data)[0]
     # Message typo fixed ("actaul" -> "actual") and a space added so the
     # expected value reads correctly on failure.
     self.assertEqual(actual_result, expected_result,
                      "actual_result should equal " + str(expected_result))
 def test_cleaner_birthday_Invalid_3_response1(self):
     """An unparseable birthday string should yield None in slot 0."""
     clean = Cleaner()
     test_data = "hello-break-me"
     expected_result = None
     actual_result = clean.Clean_Birthday(test_data)[0]
     # Message typo fixed ("actaul" -> "actual") plus a separating space.
     self.assertEqual(actual_result, expected_result,
                      "actual_result should equal " + str(expected_result))
Example #5
0
    def get_text(self):
        """Clean the bulk comments typed into the plain-text box and append
        each resulting item to the output text widget."""
        raw_text = self.plainTextEdit.toPlainText()
        cleaned_items = Cleaner().edit_bulk_comments(raw_text)
        for cleaned in cleaned_items:
            self.textEdit.append(cleaned)
 def test_cleaner_birthday_Invalid_3_response2(self):
     """A two-digit year should produce the full-year-format error message."""
     clean = Cleaner()
     test_data = "23-11-99"
     expected_result = "The year needs to be in the full format eg: 2009"
     actual_result = clean.Clean_Birthday(test_data)[1]
     # Message typo fixed ("actaul" -> "actual") plus a separating space.
     self.assertEqual(actual_result, expected_result,
                      "actual_result should equal " + str(expected_result))
 def test_cleaner_birthday_valid_2(self):
     """A well-formed dd-mm-yyyy birthday should come back unchanged."""
     clean = Cleaner()
     test_data = "25-11-1991"
     expected_result = "25-11-1991"
     actual_result = clean.Clean_Birthday(test_data)[0]
     # Message typo fixed ("actaul" -> "actual") plus a separating space.
     self.assertEqual(actual_result, expected_result,
                      "actual_result should equal " + str(expected_result))
 def test_cleaner_age_invalid(self):
     """A non-numeric age should yield None in slot 0."""
     clean = Cleaner()
     test_data = "nine"
     expected_result = None
     actual_result = clean.Clean_Age(test_data)[0]
     # Message typo fixed ("actaul" -> "actual") plus a separating space.
     self.assertEqual(actual_result, expected_result,
                      "actual_result should equal " + str(expected_result))
Example #9
0
 def __init__(self, dirPath, binsNum):
     """Build a naive Bayes classifier from <dirPath>/train.csv.

     Args:
         dirPath: directory containing train.csv and the structure file.
         binsNum: number of discretization bins.

     Raises:
         IOError: if train.csv cannot be read (after showing the dialog).
     """
     self.binsNum = binsNum
     self.dirPath = dirPath
     self.m_estimate = 2
     self.loadStructure()
     try:
         self.df = pd.read_csv(self.dirPath + "/train.csv")
     except IOError:
         tkMessageBox.showerror(
             "Naive Bayes Classifier - Error",
             "There is a problem with open " + self.dirPath + "/train.csv")
         # Bug fix: the original fell through and every statement below
         # raised AttributeError on the missing self.df; re-raise the real
         # failure after informing the user.
         raise
     self.cleaner = Cleaner(self)
     self.naiveBases = {}  #attributeValue and Classification to NaiveBase
     self.cProb = {}
     m = self.m_estimate
     for (i, record) in self.df.iterrows():
         recordDic = record.to_dict()
         # The class label and its frequency are identical for every
         # attribute of this record, so compute them once per record
         # instead of once per attribute (as the original did).
         c = recordDic["class"]
         n = len(self.df.loc[(self.df["class"] == c)].index)
         for attribute in recordDic:
             value = recordDic[attribute]
             n_c = len(self.df.loc[((self.df[attribute] == value) &
                                    (self.df["class"] == c))].index)
             M = len(self.structure[attribute])
             p = float(1) / M  # uniform prior over the attribute's values
             # m-estimate smoothing of P(value | class).
             naiveBase = float(n_c + m * p) / (n + m)
             self.naiveBases[attribute + str(value) + c] = naiveBase
     # Class priors P(c) from relative frequency in the training set.
     for c in self.structure["class"]:
         self.cProb[c] = float(
             len(self.df.loc[(self.df["class"] == c)].index)) / len(
                 self.df.index)
     tkMessageBox.showinfo("Naive Bayes Classifier - Success",
                           "Building classifier using train-set is done!")
Example #10
0
    def read_emails(self, path):
        """Read every email file under *path* into a list of Email objects.

        Args:
            path: directory containing the raw email files.

        Returns:
            list of Email objects (header, body, file id).
        """
        # Get all files
        files = [f for f in listdir(path) if isfile(join(path, f))]

        # macOS drops a DS_Store entry in browsed folders; ignore it.
        # Bug fix: catch only ValueError (name absent) instead of a bare
        # except that would also hide real errors.
        try:
            del files[files.index('DS_Store')]
        except ValueError:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = list()

        # Creates the Email Object out of each email file and appends to list
        for file_id in reader.fileids():
            # join() handles a missing trailing separator on *path*, which
            # the original "path + file_id" concatenation did not.
            with open(join(path, file_id), 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
Example #11
0
    def test_cleaner_bmi(self):
        """clean_bmi should title-case a lower-case BMI category."""
        clean = Cleaner()
        test_data = 'normal'
        expected_result = 'Normal'

        actual_result = clean.clean_bmi(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #12
0
    def test_cleaner_bmi_2(self):
        """clean_bmi should normalize an all-caps BMI category."""
        clean = Cleaner()
        test_data = 'UNDERWEIGHT'
        expected_result = 'Underweight'

        actual_result = clean.clean_bmi(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #13
0
    def test_cleaner_bmi_3(self):
        """clean_bmi should title-case the 'overweight' category."""
        clean = Cleaner()
        test_data = 'overweight'
        expected_result = 'Overweight'

        actual_result = clean.clean_bmi(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #14
0
    def test_cleaner_gender_4(self):
        """clean_gender should upper-case a single-letter gender code."""
        clean = Cleaner()
        test_data = 'f'
        expected_result = 'F'

        actual_result = clean.clean_gender(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #15
0
    def test_cleaner_empid_2(self):
        """clean_empid should upper-case the leading letter of an id."""
        clean = Cleaner()
        test_data = 'a102'
        expected_result = 'A102'

        actual_result = clean.clean_empid(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #16
0
    def test_cleaner_bmi_4(self):
        """clean_bmi should normalize mixed-case input to title case."""
        clean = Cleaner()
        test_data = 'OBEsity'
        expected_result = 'Obesity'

        actual_result = clean.clean_bmi(test_data)

        # Message typos fixed ("actaul"/"equil") plus a separating space.
        self.assertEqual(actual_result, expected_result,
                         "actual_result should equal " + expected_result)
Example #17
0
class Validator(object):
    """Validates employee record fields, cleaning each value first."""

    clean = Cleaner()

    def val_empid(self, data):
        """An empid is valid when it is 4 chars: one letter then 3 digits."""
        data = self.clean.clean_empid(data)
        if len(data) != 4:
            return False
        # Bug fix: the original tested data[0].isalpha() but did nothing
        # when the test failed, so a leading digit slipped through.
        if not data[0].isalpha():
            return False
        # Bug fix: the original iterated data[1] (one character) rather
        # than the remaining three characters.
        for ch in data[1:]:
            if not ch.isdigit():
                return False
        return True

    def val_gender(self, data):
        """Gender must clean to 'M' or 'F'."""
        data = self.clean.clean_gender(data)
        if data == "M" or data == "F":
            return True
        else:
            return False

    def val_age(self, data):
        """Any cleanable age is accepted."""
        self.clean.clean_age(data)
        return True

    def Validate_Sales(self, Given_Sales):
        """Sales are valid when they contain a 3-digit figure.

        The original body was mis-indented inside val_age and contained a
        bare ``ValueError as e`` statement (a SyntaxError); it now simply
        reports whether the pattern matches.
        """
        #check if the sales within range
        pattern = re.compile(r'\d{3}')
        if pattern.match(Given_Sales):
            return True
        else:
            return False

    def val_bmi(self, data):
        """BMI must clean to one of the four known categories."""
        data = self.clean.clean_bmi(data)
        if data == 'Normal' or data == 'Overweight' or data == 'Obesity' or data == 'Underweight':
            return True
        else:
            return False

    def Validate_Salary(self, Given_Salary):
        """Salary is valid when it contains a 2-3 digit figure.

        The original body was not indented under the def (a SyntaxError);
        the useless try around a bare ``return True`` was also dropped.
        """
        pattern = re.compile(r'[0-9]{2,3}')
        if pattern.match(Given_Salary):
            return True
        return False

    def val_birthday(self, data):
        """Any cleanable birthday is accepted."""
        self.clean.clean_birthday(data)
        return True
 def __init__(self, url):
     """Capture the query URL, derive its search terms, and set up the
     empty result table that later processing fills in."""
     # One empty column list per result field, in a fixed order.
     self.res_dict = {key: []
                      for key in ('Title', 'Content', 'Title + Content',
                                  'URL', 'ID')}
     self.url = url
     self.sg = SearchGenerator(self.url)
     self.search_terms = np.asarray(self.sg.extract_keywords())
     self.df = ''  # replaced with a DataFrame once results arrive
     self.cleaner = Cleaner()
Example #19
0
    def read_file_txt(self, all_my_employees):
        """Load employees from test_data_txt.txt, cleaning and validating
        each comma-separated field before adding them to *all_my_employees*.

        Args:
            all_my_employees: dict of empid -> Employee to extend in place.

        Returns:
            The updated all_my_employees mapping.
        """
        with open("test_data_txt.txt", "r") as file:
            data = file.readlines()
            clean = Cleaner()
            val = Validator()
            for line in data:
                valid = True
                emp = line.split(",")

                empid = clean.clean_empid(emp[0])
                if val.val_empid(all_my_employees, empid)[0] == False:
                    valid = False
                    print("empid")

                gender = clean.clean_gender(emp[1])
                if val.val_gender(gender)[0] == False:
                    valid = False
                    print("gender")

                age = clean.Clean_Age(emp[2])
                if val.Validate_Age(age[0])[0] == False:
                    valid = False
                    print("age")

                sales = emp[3]  # sales and salary are stored unvalidated

                bmi = clean.clean_bmi(emp[4])
                if val.val_bmi(bmi)[0] == False:
                    valid = False
                    print("bmi")

                salary = emp[5]

                # Birthday validation is deliberately disabled: the test
                # data's birthdays currently fail Validate_Birthday, so only
                # cleaning runs here (dead commented-out code removed).
                birthday = clean.Clean_Birthday(emp[6])

                # Bug fix: `valid != False` was an awkward double negative;
                # valid only ever holds True/False literals, so test it
                # directly.
                if valid:
                    employee = Employee(empid, gender, age[0], sales, bmi,
                                        salary, birthday[0])
                    all_my_employees[empid] = employee
                else:
                    print("Failed to add employee")
        return all_my_employees
Example #20
0
 def __init__(self, queryFile, queryJSON):
     """Prepare the query processor.

     Args:
         queryFile: txt file in which all raw queries are stored.
         queryJSON: json file that receives the queries after cleaning.
     """
     self._cleaner = Cleaner(" ", " ")
     self._qFile = queryFile
     self._qJson = queryJSON
     self._queryList = []   # raw queries
     self._queryDict = {}   # refined queries
     self._stopList = []    # stop words
     self._qID = 1          # query ids start at 1
Example #21
0
class Controller(object):
    """Ad-hoc smoke tests that push sample values through a Cleaner pass
    followed by a Validator pass, printing each result."""

    val = Validator()
    clean = Cleaner()

    def test_empid(self):
        """Clean then validate a sample employee id."""
        sample_id = "a001"
        print(self.clean.clean_empid(sample_id))
        print(self.val.val_empid(self.clean.clean_empid(sample_id)))

    def test_gender(self):
        """Validate a deliberately bad gender code."""
        print(self.val.val_gender('lbp'))

    def test_bmi(self):
        """Clean then validate a sample BMI category."""
        sample_bmi = 'normal'
        print(self.clean.clean_bmi(sample_bmi))
        print(self.val.val_bmi(self.clean.clean_bmi(sample_bmi)))
Example #22
0
    def get_important_words(self, emails, path=None):
        """Rank words across *emails* by tf-idf and return them, highest
        score first (after stop-word removal); optionally write one word
        per line to *path*.
        """
        cleaner = Cleaner()

        complete_email_text = ''

        # Concatenate every email's topic line and tag-stripped body into
        # one document.
        for email in emails:
            email_header = cleaner.delete_tags(email.header)
            email_body = cleaner.delete_tags(email.body)

            topic_line = re.findall(r'Topic.*\n', email_header)[0]
            topic_line = topic_line[6:].strip()  # drop the 6-char "Topic" prefix

            complete_email_text = complete_email_text + topic_line + '\n' + email_body + '\n'

        # Collapse all whitespace runs to single spaces. Bug fix: raw
        # strings replace the invalid '\s' escape of the original.
        complete_email_text = re.sub(r'\n', ' ', complete_email_text)
        complete_email_text = re.sub(r'\s', ' ', complete_email_text)
        complete_email_text = re.sub(r' +', ' ', complete_email_text)

        complete_email_text = tb(complete_email_text)
        bloblist = [complete_email_text]

        words = []

        # bloblist holds a single blob; the loop keeps the original shape
        # but the unused enumerate index was removed.
        for blob in bloblist:
            scores = {word: self.tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            for word, score in sorted_words:
                words.append(word)

            # Delete Stop-Words
            words = self.delete_stopwords(words)

            if path is not None:
                with open(path, 'w') as current_file:
                    for word in words:
                        current_file.write('{}\n'.format(word))

        return words
Example #23
0
    def XML_Reader(self, file_Location, error_File_Location,
                   all_employees):  # Add validate
        """Parse <user> elements from an XML file into Employee objects.

        Args:
            file_Location: path of the XML file to parse.
            error_File_Location: currently unused; reserved for the error
                log once validation is implemented.
            all_employees: currently unused; kept for interface parity.

        Returns:
            dict mapping empid -> Employee.
        """
        MyCleaner = Cleaner()
        my_Employees = {}
        root = ET.parse(file_Location).getroot()
        for user in root.findall('user'):
            # TODO: validate all given data before trusting it (the old
            # commented-out validation sketch was removed).
            empid = user.get('EMPID')
            gender = user.find('gender').text
            age = user.find('age').text
            sales = user.find('sales').text
            bmi = user.find('BMI').text
            salary = user.find('salary').text
            birthday = user.find('birthday').text
            # Salary may carry thousands separators ("12,345"); strip them
            # before the int conversion.
            new_Employee = Employee(MyCleaner.clean_empid(empid),
                                    MyCleaner.clean_gender(gender),
                                    MyCleaner.Clean_Age(age)[0], int(sales),
                                    MyCleaner.clean_bmi(bmi),
                                    int(salary.replace(',', '')),
                                    MyCleaner.Clean_Birthday(birthday)[0])
            my_Employees[new_Employee.my_empid] = new_Employee

        return my_Employees
    def evaluate_tags(gr_email, pred_email):
        """Compare the tagged spans of a ground-truth email against a
        predicted email and return (tp, fp, fn) counts per matched span.
        """
        cleaner = Cleaner()

        regex = {
            'time': r'<[s|e]time>.*?</[s|e]time>',
            'speaker': r'<speaker>.*?</speaker>',
            'location': r'<location>.*?</location>',
            'sentence': r'<sentence>.*?</sentence>',
            'paragraph': r'<paragraph>.*?</paragraph>'
        }

        def extract_tags(email):
            # Join header and body, drop newlines, then collect the cleaned
            # text of every tagged span keyed by tag name. The original
            # duplicated this whole block for gr_email and pred_email
            # (re.M is the same flag as re.MULTILINE, so unifying is safe).
            text = (email.header + email.body).replace('\n', '')
            tags = {}
            for k in regex.keys():
                tags[k] = re.findall(regex[k], text, re.MULTILINE)
                for i in range(0, len(tags[k])):
                    tags[k][i] = cleaner.clean_file(tags[k][i])
            return tags

        gr_email_tags = extract_tags(gr_email)
        pred_email_tags = extract_tags(pred_email)

        tp = 0
        fp = 0
        fn = 0

        # change gr_email_tags.keys() to ['key'] to evaluate a specific tag
        for k in gr_email_tags.keys():
            gr = gr_email_tags[k]
            pred = pred_email_tags[k]

            # removing all punctuations and spaces from both email tag lists
            for i in range(0, len(gr)):
                gr[i] = re.sub(r'[^\w\s]', '', gr[i])
                gr[i] = re.sub(' ', '', gr[i])

            for i in range(0, len(pred)):
                pred[i] = re.sub(r'[^\w\s]', '', pred[i])
                pred[i] = re.sub(' ', '', pred[i])

            # A ground-truth tag found among the predictions is a true
            # positive; it is consumed so duplicates are not double-counted.
            for t in gr:
                if t in pred:
                    tp = tp + 1
                    pred.remove(t)
                else:
                    fn = fn + 1

            # Whatever predictions remain unmatched are false positives.
            fp = fp + len(pred)

        return tp, fp, fn
Example #25
0
from Analyzer import Analyzer
from Cleaner import Cleaner
from Cluster import Cluster
from Crawler import Crawler
from Uploader import Uploader

this_date = time.strftime("%Y%m%d", time.localtime())
# Crawl today's news
crawler = Crawler(this_date=this_date)
crawler.crawl()

# Cluster the crawled articles into groups
cluster = Cluster(date=this_date)
cluster.remove_useless_articles()
cluster.load_articles()
cluster.cluster()
cluster.upload_groups_to_DB()

# Sentiment analysis
analyzer = Analyzer(date=this_date)
analyzer.analyze()

# Upload to LeanCloud
uploader = Uploader(date=this_date)
uploader.upload_new_groups()

# Remove news groups that are too old or scored too low
cleaner = Cleaner(date=this_date)
cleaner.clean()
Example #26
0
# Newsgroup category subsets for the classification experiment; only
# cate2 (the computing groups) is actually used below.
categories = [
    "alt.atheism", "soc.religion.christian", "sci.med", "comp.graphics"
]

cate2 = [
    "comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware", "comp.windows.x"
]

# Fetch the 20-newsgroups train/test splits restricted to cate2.
twenty_train = fetch_20newsgroups(subset="train",
                                  categories=cate2,
                                  shuffle=True)
twenty_test = fetch_20newsgroups(subset="test", categories=cate2, shuffle=True)

# cleaning data set: report category counts, then strip headers
truck_cleaner = Cleaner()
truck_cleaner.get_data_category_count(twenty_train)
cleaner_text = truck_cleaner.text_header_remover(twenty_train.data)

# preparing dataset
import nltk
# one-time downloads, kept for reference:
#nltk.download('punkt')
#nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.corpus import stopwords
#import numpy as np


def tokenizer_helper(cleaner_text_list):
    tokenize_sentences_list = []
    for sentence in cleaner_text_list:
Example #27
0
n_starting_triplets = 0
n_total_triplets = 0

files = [f for f in os.listdir("./processed")]

for x in range(0,len(files)):
	if x > 1:
		print("completed: ", round((x * 100) / len(files),1), "%           ", end='\r')
	file_name =  os.path.join("./processed", files[x])
	try:
		df = pd.read_csv(filepath_or_buffer = file_name, index_col=0, dtype = str, na_filter=False)
		# df = df.drop(["id_df"], axis=1)

		n_starting_triplets += len(df)
		cleaner = Cleaner(df, t5_tokenizer, stopwords, english_cache)

		cleaner.remove_non_marked()
		cleaner.clean_df()

		# final cleaning : remove methods which has more than one review
		cleaner.remove_multiple_method_comments()

		n_irrelevant_comments += cleaner.irrelevant_comments
		n_not_marked += cleaner.not_marked
		n_non_latin += cleaner.non_latin
		n_before_equals_after += cleaner.before_equals_after
		n_non_english += cleaner.non_english
		n_too_long += cleaner.too_long
		n_too_long_after += cleaner.too_long_after
		n_multiple_rev += cleaner.multiple_reviews
Example #28
0
from argparse import ArgumentParser

from Cleaner import Cleaner
from Spanbert import SpanBert
from entitycentric import entity_centric_segmentation

# Module-level singletons shared by the helper functions below.
cleaner = Cleaner()
model = SpanBert()


def write(f, text, tar, sent):
    """Write the text, target and sentiment to *f*, one per line."""
    for field in (text, tar, sent):
        f.write(field)
        f.write("\n")


def segmentation(text, tar):
    text = text.replace("$T$", tar)
    text = cleaner.clean(text)

    clusters = model.predict(text)
    tokens = model.get_tokens()

    return entity_centric_segmentation(clusters,
                                       tokens,
                                       tar,
                                       min_dist=15,
                                       anaphora_only=False,
 def get_cleantext(self, text):
     """Run *text* through a fresh Cleaner and return the cleaned string."""
     return Cleaner().clean_text(text)
Example #30
0
def testClean():
    """Cleaner('Eli').clean should report that Eli is cleaning."""
    cleaner = Cleaner("Eli")
    # Bug fix: `is` compares object identity and only passes by accident
    # of string interning; use `==` for value equality.
    assert cleaner.clean == "Eli is cleaning"