Example #1
    def run(self):

        # Authenticate with Reddit
        if self.auth():
            print(self.DASH)
            print('Reddit Authentication Successful.\nWelcome, {}'.format(
                self.REDDIT_USERNAME))

        # Pick a subreddit at random from the pool
        r = random.randint(0, len(self.SUBREDDITS) - 1)
        subreddit = self.getSubreddit(self.SUBREDDITS[r])

        # Fetch posts from the chosen subreddit
        posts = self.getSubInfo(subreddit)

        print(self.DASH)
        print("Starting uploads...")
        print(self.DASH)

        self.postToIG(posts)

        print('Uploaded {} posts'.format(self.COUNT))

        print('Starting cleaner...')
        Cleaner.clean()
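
A note on the random pick above: indexing with a random integer is equivalent to random.choice. A minimal standalone sketch, with placeholder subreddit names that are not from the original project:

import random

SUBREDDITS = ["memes", "pics", "aww"]  # placeholder pool; the real list lives on the bot class

# Same effect as: r = random.randint(0, len(SUBREDDITS) - 1); name = SUBREDDITS[r]
name = random.choice(SUBREDDITS)
print(name)
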
Example #2
import time

from Analyzer import Analyzer
from Cleaner import Cleaner
from Cluster import Cluster
from Crawler import Crawler
from Uploader import Uploader

this_date = time.strftime("%Y%m%d", time.localtime())
# Crawl the news
crawler = Crawler(this_date=this_date)
crawler.crawl()

# Cluster the articles
cluster = Cluster(date=this_date)
cluster.remove_useless_articles()
cluster.load_articles()
cluster.cluster()
cluster.upload_groups_to_DB()

# Sentiment analysis
analyzer = Analyzer(date=this_date)
analyzer.analyze()

# Upload to LeanCloud
uploader = Uploader(date=this_date)
uploader.upload_new_groups()

# Remove news groups that are too old or scored too low
cleaner = Cleaner(date=this_date)
cleaner.clean()
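
A minimal sketch that wraps the same five steps in a reusable function, assuming the imports shown above (including time); the name run_pipeline is an assumption, not part of the original project:

def run_pipeline(this_date=None):
    # Default to today's date in the same YYYYMMDD format used above.
    this_date = this_date or time.strftime("%Y%m%d", time.localtime())

    Crawler(this_date=this_date).crawl()           # crawl news

    cluster = Cluster(date=this_date)              # cluster articles into groups
    cluster.remove_useless_articles()
    cluster.load_articles()
    cluster.cluster()
    cluster.upload_groups_to_DB()

    Analyzer(date=this_date).analyze()             # sentiment analysis
    Uploader(date=this_date).upload_new_groups()   # upload to LeanCloud
    Cleaner(date=this_date).clean()                # drop groups that are too old or low-scoring
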
Example #3
import os

import pandas as pd
import tkMessageBox

from Cleaner import Cleaner  # assumes Cleaner lives in Cleaner.py, as in Example #2


class Classifier:
    def __init__(self, dirPath, binsNum):
        self.binsNum = binsNum
        self.dirPath = dirPath
        self.m_estimate = 2
        self.loadStructure()
        try:
            self.df = pd.read_csv(self.dirPath + "/train.csv")
        except IOError:
            tkMessageBox.showerror(
                "Naive Bayes Classifier - Error",
                "There is a problem with opening " + self.dirPath + "/train.csv")
            return
        self.cleaner = Cleaner(self)
        self.naiveBases = {}  # maps attribute + value + class to the smoothed P(value | class)
        self.cProb = {}
        for (i, record) in self.df.iterrows():
            recordDic = record.to_dict()
            for attribute in recordDic:
                value = recordDic[attribute]
                c = recordDic["class"]
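                # m-estimate smoothing of P(value | class):
                #   (n_c + m*p) / (n + m), with p = 1/M a uniform prior over the
                #   attribute's M possible values and m = self.m_estimate (2).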
                n_c = len(self.df.loc[((self.df[attribute] == value) &
                                       (self.df["class"] == c))].index)
                n = len(self.df.loc[(self.df["class"] == c)].index)
                m = self.m_estimate
                M = len(self.structure[attribute])
                p = float(1) / M
                naiveBase = float(n_c + m * p) / (n + m)
                self.naiveBases[attribute + str(value) + c] = naiveBase
        for c in self.structure["class"]:
            self.cProb[c] = float(
                len(self.df.loc[(self.df["class"] == c)].index)) / len(
                    self.df.index)
        tkMessageBox.showinfo("Naive Bayes Classifier - Success",
                              "Building classifier using train-set is done!")

    def loadStructure(self):
        try:
            structureFile = open(self.dirPath + "/Structure.txt", "r")
        except IOError:
            tkMessageBox.showerror(
                "Naive Bayes Classifier - Error",
                "There is a problem with opening " + self.dirPath +
                "/Structure.txt")
            return
        with structureFile:
            self.structure = {}  # Each attribute and its possible values
            for attribute in structureFile:
                attributeParts = attribute.split()
                values = self.getValues(attributeParts)
                self.structure[attributeParts[1]] = values

    def getValues(self, attributeParts):
        if attributeParts[2][0] != '{':
            return [attributeParts[2]]
        else:
            attributeParts = attributeParts[2][1:-1]
            values = attributeParts.split(",")
            return values

    def classify(self):
        if os.path.exists(self.dirPath + "/output.txt"):
            os.remove(self.dirPath + "/output.txt")
        output = open(self.dirPath + "/output.txt", "w+")
        self.test = self.cleaner.clean(pd.read_csv(self.dirPath + "/test.csv"))
        counter = 0
        for (i, record) in self.test.iterrows():
            recordDic = record.to_dict()
            naiveBasesRecord = {}
            for c in self.structure["class"]:
                naiveBasesRecord[c] = self.cProb[c]
                for attribute in recordDic:
                    if not attribute == "class":
                        tmp = attribute + str(recordDic[attribute]) + c
                        tmpValue = self.naiveBases.get(tmp)
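                        # Combinations never seen in training fall back to a
                        # neutral factor of 1 instead of zeroing the product.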
                        if not type(tmpValue) == float:
                            tmpValue = 1
                        naiveBasesRecord[c] = naiveBasesRecord[c] * tmpValue
            cMax = self.getMaxClass(naiveBasesRecord)
            output.write(str(counter) + " " + str(cMax) + "\n")
            counter += 1
        output.close()

    def getMaxClass(self, naiveBasesRecord):
        first = True
        for c in naiveBasesRecord:
            if first:
                cMax = c
                first = False
            if naiveBasesRecord[c] == max(naiveBasesRecord[c],
                                          naiveBasesRecord[cMax]):
                cMax = c
        return cMax
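
The manual argmax in getMaxClass can also be written with max() and a key function. A minimal standalone sketch (the scores are made-up illustration values):

# Same argmax-over-dict logic as getMaxClass.
naiveBasesRecord = {"yes": 0.0123, "no": 0.0045}  # illustrative scores only
cMax = max(naiveBasesRecord, key=naiveBasesRecord.get)
print(cMax)  # -> yes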