Esempio n. 1
0
    def run(self):
        """Parse the Google Trends CSV exports and compare the statistics by gender.

        For every file in ``self.onlyfiles`` whose name contains ``.csv``
        (opened from ``self.datapath``) the rows after the ``Region,`` header
        and the rows after the ``Month,`` header are collected, each up to the
        first blank line. Parsing of a file stops at a line starting with
        ``<div id=`` (reported on stdout as a quota error). Per file this yields:

        * ``entropy``         - ``stats.entropy`` of the regional values
                                (NaN unless their sum is positive)
        * ``numRegions``      - number of regional rows
        * ``timeEntropy``     - ``stats.entropy`` of the time series
                                (NaN if it has no non-zero entry)
        * ``timeInterest``    - sum of the time series
        * ``timePosInterest`` - number of non-zero time-series entries

        The statistics are inner-merged into ``self.people`` on ``filename``.
        For every statistic a Mann-Whitney U test (female vs. male) is written
        to ``self.logfile`` and output paths below ``self.imgpath`` are handed
        to the plotting helpers. Finally ``self.regression()`` is called.
        """
        regionalEntropy = {}
        regionCount = {}
        timeEntropy = {}
        sumInterest = {}
        posInterest = {}

        ##############################################################################
        # PARSE GOOGLE TREND RESULT FILES
        ##############################################################################
        for fname in self.onlyfiles:
            pos = fname.find(".csv")
            if pos < 0:
                # Not a csv export. Slicing with pos == -1 would silently cut
                # the last character off the name and parse the file anyway.
                continue
            filename = fname[0:pos]

            regions = {}
            timeseries = {}
            regionHeader = -1    # 1-based line number of the "Region," header
            timeHeader = -1      # 1-based line number of the "Month," header
            regionsDone = False  # blank line after the regional rows was seen
            timeDone = False     # blank line after the time-series rows was seen
            linesParsed = 0      # stays 0 if the very first line is a quota error

            with open(self.datapath+"/"+fname) as f:
                for lineNo, line in enumerate(f, 1):
                    if line.startswith("<div id="):
                        print("quota error for %s" % filename)
                        break
                    if line.startswith("Region,"):
                        regionHeader = lineNo
                    if line.startswith("Month,"):
                        timeHeader = lineNo

                    # strip() so that "\r\n" or whitespace-only lines also end a section
                    blank = not line.strip()

                    if regionHeader > 0 and lineNo > regionHeader and not regionsDone:
                        if blank:
                            regionsDone = True
                        else:
                            items = line.split(",")
                            regions[items[0]] = items[1]

                    if timeHeader > 0 and lineNo > timeHeader and not timeDone:
                        print(line)
                        if blank:
                            timeDone = True
                        else:
                            items = line.split(",")
                            # sometimes gtrends returns an empty field rather than 0
                            if items[1].strip() == "":
                                timeseries[items[0]] = "0"
                            else:
                                timeseries[items[0]] = items[1]
                    linesParsed = lineNo

            if linesParsed > 0:
                # list comprehensions instead of map(): numpy needs real sequences
                timeFrequs = [int(value) for value in timeseries.values()]
                regionFrequs = [int(value) for value in regions.values()]

                sumInterest[filename] = np.sum(timeFrequs)
                posInterest[filename] = np.count_nonzero(timeFrequs)

                if posInterest[filename] > 0:
                    timeEntropy[filename] = stats.entropy(timeFrequs)
                else:
                    timeEntropy[filename] = np.nan

                regionCount[filename] = len(regionFrequs)
                if np.sum(regionFrequs) > 0:
                    regionalEntropy[filename] = stats.entropy(regionFrequs)
                else:
                    regionalEntropy[filename] = np.nan

        def toFrame(valueByFile, column):
            # Passing the column names to the constructor (instead of assigning
            # .columns afterwards) also works when no file produced a value.
            return pd.DataFrame(list(valueByFile.items()), columns=["filename", column])

        # store results into dataframes
        regionalEntropyDF = toFrame(regionalEntropy, "entropy")
        print(regionalEntropyDF.head())
        regionCountDF = toFrame(regionCount, "numRegions")
        interestDF = toFrame(sumInterest, "timeInterest")
        timeEntropyDF = toFrame(timeEntropy, "timeEntropy")
        posInterestDF = toFrame(posInterest, "timePosInterest")

        # add the computed statistics to the people file
        for statDF in (regionalEntropyDF, regionCountDF, timeEntropyDF, interestDF, posInterestDF):
            self.people = self.people.merge(statDF, right_on="filename", left_on="filename", how="inner")

        print("AFTER MERGING")
        print(self.people.head(n=1))
        print(self.people.shape)

        def splitByGender(people):
            # Returns (women, men, labels); the labels carry the group sizes,
            # so they have to be rebuilt whenever the population is filtered.
            women = people[people.gender == "female"]
            men = people[people.gender == "male"]
            labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')']
            return women, men, labels

        def compare(women, men, labels, column, logTitle, boxLabel, ccdfLabel, ccdfOpts, stem, ccdfStem=None):
            # One gender comparison: U test into the logfile, then boxplot,
            # ccdf and facet plot. ccdfOpts are the trailing positional
            # arguments of plot_ccdf, passed through unchanged.
            femaleValues = women[column].values
            maleValues = men[column].values
            self.logfile.write(logTitle)
            U, p = stats.mstats.mannwhitneyu(femaleValues, maleValues)
            ut.write_mannwithneyu(U, p, femaleValues, maleValues, self.logfile)
            self.make_boxplot([femaleValues, maleValues], labels, self.imgpath+stem+"_box.png", boxLabel)
            self.plot_ccdf(np.array(femaleValues.tolist()), np.array(maleValues.tolist()), labels,
                           self.imgpath+(ccdfStem or stem)+"_ccdf.png", ccdfLabel, *ccdfOpts)
            ut.plot_facet_dist(self.people, 'gender', column, self.imgpath+stem+".png")

        ##############################################################################
        # PLOTS NUM REGIONS
        ##############################################################################
        women, men, labels = splitByGender(self.people)
        compare(women, men, labels, 'numRegions', "\n Mann Withney U Num regions:",
                "num regions", "Num Regions", (1, 0), "gtrend_num_regions")
        ut.rank_size_plot(self.people, 'numRegions', 'Num Regions Gtrends',  self.imgpath+"gtrend_num_regions_ranksize.png")

        ##############################################################################
        # PLOTS TOTAL INTEREST
        ##############################################################################
        compare(women, men, labels, 'timeInterest', "\n \n Mann Withney U Temp Sum Interest:",
                "sum interest", "Sum Interest", (1, 0), "gtrend_time_interest")
        compare(women, men, labels, 'timePosInterest', "\n\n Mann Withney U Temp Pos Interest:",
                "num weeks with interest", "Num weeks with interest", (1, 0), "gtrend_time_pos_interest")

        ##############################################################################
        # PLOT Entropy Temp INTEREST
        ##############################################################################
        # Rows without a finite entropy are dropped, so the labels are rebuilt
        # to show the sizes of the groups that are actually compared.
        limPeople = self.people[np.isfinite(self.people['timeEntropy'])]
        women, men, labels = splitByGender(limPeople)
        compare(women, men, labels, 'timeEntropy', "\n\n Mann Withney U Time Entropy:",
                "temporal entropy", "Temp Entropy", (1, 0), "gtrend_time_entropy")

        ##############################################################################
        # PLOT ENTROPY
        ##############################################################################
        # for entropy we need to remove the nan value. If we dont have data the result is nan
        limPeople = self.people[np.isfinite(self.people['entropy'])]
        women, men, labels = splitByGender(limPeople)
        # The boxplot path now carries the self.imgpath prefix like every other
        # plot; the ccdf keeps its historical "gtrend_entropy" file stem.
        compare(women, men, labels, 'entropy', "\n\n Mann Withney U Entropy:",
                "shannon entropy", "Entropy", (0, 0), "gtrend_region_entropy", ccdfStem="gtrend_entropy")

        self.regression()
    def run(self):
        """Parse the Google Trends JSON results and compare the statistics by gender.

        For every file in ``self.onlyfiles`` whose name contains ``.json``
        (opened from ``self.datapath``) the content is handed to
        ``self.parse_gtrend_json``. If the result has more than three
        elements, it is read as a flat sequence of alternating key/value
        entries forming the time series. When the series has more than one
        value, this yields per file:

        * ``timeEntropy``     - ``stats.entropy`` of the time series
                                (NaN if it has no non-zero entry)
        * ``timeInterest``    - sum of the time series
        * ``timePosInterest`` - number of non-zero time-series entries

        The statistics are inner-merged into ``self.people`` on ``filename``.
        For every statistic a Mann-Whitney U test (female vs. male) is written
        to ``self.logfile`` and output paths below ``self.imgpath`` are handed
        to the plotting helpers. Finally ``self.regression()`` is called.
        """
        timeEntropy = {}
        sumInterest = {}
        posInterest = {}

        ##############################################################################
        # PARSE GOOGLE TREND RESULT FILES
        ##############################################################################
        for fname in self.onlyfiles:
            pos = fname.find(".json")
            if pos < 0:
                continue
            filename = fname[0:pos]

            with open(self.datapath+"/"+fname) as f:
                content = f.read()
            parsed = self.parse_gtrend_json(content)

            print(parsed)
            print(len(parsed))

            timeseries = {}
            if len(parsed) > 3:
                # entries alternate key, value; a trailing unpaired key is ignored
                for ind in range(0, len(parsed) - 1, 2):
                    timeseries[parsed[ind]] = parsed[ind+1]

            # list comprehension instead of map(): numpy needs a real sequence
            timeFrequs = [int(value) for value in timeseries.values()]

            if len(timeFrequs) > 1:
                sumInterest[filename] = np.sum(timeFrequs)
                posInterest[filename] = np.count_nonzero(timeFrequs)

                if posInterest[filename] > 0:
                    timeEntropy[filename] = stats.entropy(timeFrequs)
                else:
                    timeEntropy[filename] = np.nan

        def toFrame(valueByFile, column):
            # Passing the column names to the constructor (instead of assigning
            # .columns afterwards) also works when no file produced a value.
            return pd.DataFrame(list(valueByFile.items()), columns=["filename", column])

        interestDF = toFrame(sumInterest, "timeInterest")
        timeEntropyDF = toFrame(timeEntropy, "timeEntropy")
        posInterestDF = toFrame(posInterest, "timePosInterest")

        # add the computed statistics to the people file
        for statDF in (timeEntropyDF, interestDF, posInterestDF):
            self.people = self.people.merge(statDF, right_on="filename", left_on="filename", how="inner")

        print("AFTER MERGING")
        print(self.people.head(n=1))
        print(self.people.shape)

        def splitByGender(people):
            # Returns (women, men, labels); the labels carry the group sizes,
            # so they have to be rebuilt whenever the population is filtered.
            women = people[people.gender == "female"]
            men = people[people.gender == "male"]
            labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')']
            return women, men, labels

        def compare(women, men, labels, column, logTitle, boxLabel, ccdfLabel, stem):
            # One gender comparison: U test into the logfile, then boxplot,
            # ccdf and facet plot, all named after the same file stem.
            femaleValues = women[column].values
            maleValues = men[column].values
            self.logfile.write(logTitle)
            U, p = stats.mstats.mannwhitneyu(femaleValues, maleValues)
            ut.write_mannwithneyu(U, p, femaleValues, maleValues, self.logfile)
            self.make_boxplot([femaleValues, maleValues], labels, self.imgpath+stem+"_box.png", boxLabel)
            self.plot_ccdf(np.array(femaleValues.tolist()), np.array(maleValues.tolist()), labels,
                           self.imgpath+stem+"_ccdf.png", ccdfLabel, 1, 0)
            ut.plot_facet_dist(self.people, 'gender', column, self.imgpath+stem+".png")

        ##############################################################################
        # PLOTS TOTAL INTEREST
        ##############################################################################
        women, men, labels = splitByGender(self.people)
        compare(women, men, labels, 'timeInterest', "\n \n Mann Withney U Temp Sum Interest:",
                "sum interest", "Sum Interest", "gtrend_time_interest")
        compare(women, men, labels, 'timePosInterest', "\n\n Mann Withney U Temp Pos Interest:",
                "num weeks with interest", "Num weeks with interest", "gtrend_time_pos_interest")

        ##############################################################################
        # PLOT Entropy Temp INTEREST
        ##############################################################################
        # Rows without a finite entropy are dropped, so the labels are rebuilt
        # to show the sizes of the groups that are actually compared.
        limPeople = self.people[np.isfinite(self.people['timeEntropy'])]
        women, men, labels = splitByGender(limPeople)
        compare(women, men, labels, 'timeEntropy', "\n\n Mann Withney U Time Entropy:",
                "temporal entropy", "Temp Entropy", "gtrend_time_entropy")

        self.regression()