Ejemplo n.º 1
0
def gettweets(howmanytweets, dataoutputfile, logfile):
    """Search Twitter for " mom " and dump every field of each tweet.

    The starting sequence number is read from 'tweetsequencekey.txt' in the
    current directory, and the file is immediately rewritten with
    ``start + howmanytweets + 1`` so that a later run starts from there.
    Each tweet found gets the next sequence number; one line per
    key/value pair of the tweet's JSON dict is written to dataoutputfile.

    Args:
        howmanytweets: Number of tweets after which the search loop stops.
            The check runs after a tweet is processed, so at least one
            tweet is written whenever the search returns anything.
        dataoutputfile: Open, writable file object receiving the
            'XXZZXX'-delimited key/value lines.
        logfile: Handle passed through to printoutput and limit_handled.

    Raises:
        IOError/OSError: If 'tweetsequencekey.txt' cannot be opened.
        ValueError: If that file does not contain an integer.
    """
    # SECURITY NOTE(review): these API credentials are hard-coded in the
    # source. They are left unchanged here to preserve behavior, but they
    # should be revoked and loaded from the environment or a config file.
    consumer_key = 'bqb2614rUsb9Ts7DhtyTaPuWI'
    consumer_secret = 'Q9UMklSd41gMzUKIMzt5CzZQZNOS6E5ndHNGk58pUxBShvGO2r'
    access_token = '925969087997542400-mqhVvH56uE8pA5sqprq6vCh7XVB00Wj'
    access_secret = 'a9xyKdciNSoZdg6AapOA6774AVEYgnpId5eelD9DvXwRS'

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    api = tweepy.API(auth)

    # 'with' guarantees the handle is closed even if int() rejects the
    # file contents or the write fails.
    with open('tweetsequencekey.txt') as seqkeyfile:
        tweetsequencekey = int(seqkeyfile.read())

    tweetsequencenew = tweetsequencekey + howmanytweets + 1
    with open('tweetsequencekey.txt', 'w') as seqkeyfile:
        seqkeyfile.write('%d\n' % (tweetsequencenew))

    count = 0
    for result in limit_handled(
            tweepy.Cursor(api.search, q=" mom ").items(), logfile):

        # result._json is already the parsed dict for this tweet, so no
        # dumps/load round trip is needed to obtain it.
        thisdict = result._json

        outstring = 'KEY VALUE PAIRS COUNTER %5d' % (count)
        printoutput(outstring, logfile)
        for key, value in sorted(thisdict.items()):
            outstring = '%8d XXZZXX %s XXZZXX %s ZZXXZZ' % (tweetsequencekey,
                                                            key, value)
            dataoutputfile.write(outstring + '\n')

        count += 1
        tweetsequencekey += 1

        # This is a kluge in here to limit execution so we can test code.
        # In a real application this might go quite a while before exiting.
        if count >= howmanytweets:
            break
Ejemplo n.º 2
0
def limit_handled(cursor, logfile):
    """Yield items from a cursor, coping with rate limits and API errors.

    Args:
        cursor: Object whose ``next()`` method returns the next item and
            raises StopIteration when exhausted (e.g. the iterator
            returned by ``tweepy.Cursor(...).items()``).
        logfile: Handle passed through to printoutput for log messages.

    Yields:
        Each item returned by ``cursor.next()``.

    Behavior on errors:
        * StopIteration ends the generator normally.
        * tweepy.RateLimitError is logged, then the generator sleeps for
          15 minutes before retrying.
        * Any other tweepy.TweepError is logged with its message and ends
          the generator instead of retrying forever.
    """
    # Function-scope import so the block does not depend on a file-level
    # 'import time' being present.
    import time

    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # Letting StopIteration escape a generator body is turned into
            # RuntimeError by PEP 479 (Python 3.7+); end cleanly instead.
            return
        except tweepy.RateLimitError:
            printoutput("limit reached", logfile)
            # Without a pause this loop would spin and hammer the API.
            time.sleep(15 * 60)
        except tweepy.TweepError as err:
            # Not a rate limit: report the real error and stop.
            printoutput("tweepy error: %s" % (err,), logfile)
            return
        else:
            yield item
Ejemplo n.º 3
0
 def printsparsematrix(self, which):
     """Print one line per row of the selected sparse matrix to OUTFILE.

     Args:
         which: 'TERM' prints self.sparsematrixbyterm with a 'MATRIXTERM'
             prefix; 'DOC' prints self.sparsematrixbydoc with a
             'MATRIXDOC' prefix. Any other value prints nothing.

     Rows are printed in sorted key order. For every entry in a row,
     element 0 and element 3 of the entry are shown as 'name:number'.
     """
     if which == 'TERM':
         matrix = self.sparsematrixbyterm
         cellformat = '   {:5s}:{:6.4f}'
         rowformat = 'MATRIXTERM {:20s} {:s}'
     elif which == 'DOC':
         matrix = self.sparsematrixbydoc
         cellformat = '   {:5s}:{:6.3f}'
         rowformat = 'MATRIXDOC {:6s} {:s}'
     else:
         # Unknown selector: nothing is printed.
         return

     for rowkey, entries in sorted(matrix.items()):
         cells = ''.join(
             cellformat.format(entry[0], entry[3]) for entry in entries)
         printoutput(rowformat.format(rowkey, cells), OUTFILE)
Ejemplo n.º 4
0
def main(pathtodata):
    """Run the tf-idf and cosine-comparison pipeline over a directory.

    Args:
        pathtodata: Directory path; every entry matching
            ``pathtodata + '/*'`` is recorded as an input filename.

    Steps, in order: log a 'BEGINNING' timer line to LOGFILE, compute term
    frequencies, keep the 10 most frequent terms of each doc, compute
    tf-idf, print the 'SHORT' results, then report doc/doc cosine scores
    of at least 0.50 and term/term scores of at least 0.90 to OUTFILE,
    and finally log an 'ENDING' timer line to LOGFILE.
    """

    def reportcosines(label, matrix, prefixformat, threshold):
        """Print, per key, the partners whose score reaches threshold."""
        scores = computecosinecomparison(label, matrix)
        for name, partners in sorted(scores.items()):
            pieces = [prefixformat.format(name)]
            pieces.extend(
                ' ({:6s} {:8.4f})'.format(pair[0], pair[1])
                for pair in sorted(partners)
                if pair[1] >= threshold)
            printoutput(''.join(pieces), OUTFILE)

    # Measure process and wall clock times.
    dabtimer = DABTimer()
    printoutput(dabtimer.timecall('BEGINNING'), LOGFILE)

    theglobals = Globals()
    theglobals.filenames = glob(pathtodata + '/*')

    # Compute the term frequencies in each file.
    computetermfreqs(theglobals)

    # Filter from all the terms to only the most frequent.
    for document in theglobals.docs.values():
        document.filterformostfrequent(10)

    # Compute the tfidf and the sparse matrices along the way.
    computetfidf(theglobals)

    # Print the results.
    theglobals.printresults('SHORT')

    # Now compute and print the cosine comparisons.
    reportcosines('DOCDOC', theglobals.sparsematrixbydoc,
                  'DOCDOC {:7s} ', 0.50)
    reportcosines('TERMTERM', theglobals.sparsematrixbyterm,
                  'TERMTERM {:15s} ', 0.90)

    printoutput(dabtimer.timecall('ENDING'), LOGFILE)
Ejemplo n.º 5
0
def computecosinecomparison(label, sparsematrix):
    """Compute the cosine similarity of every unordered pair of rows.

    Args:
        label: Text prefixed to every line printed to OUTFILE.
        sparsematrix: Dict mapping a string key to a list of entries.
            Entries are matched between rows on element 0 and their
            element 1 values are multiplied (the original author's notes
            call these the term and its tf-idf).

    Returns:
        A defaultdict(list) mapping each key to a list of
        ``[otherkey, cosine]`` pairs; every pair of distinct keys is
        recorded under both keys.

    Each row's length comes from computevectorlength. If either row of a
    pair has length zero the similarity is recorded as 0.0 rather than
    raising ZeroDivisionError.
    """
    doclength = defaultdict(float)
    for key, value in sparsematrix.items():
        doclength[key] = computevectorlength(value)

    for key, value in sorted(doclength.items()):
        sss = '{:s} LENGTH {:20s} {:10.4f}'.format(label, key, value)
        printoutput(sss, OUTFILE)

    cosinedict = defaultdict(list)
    for key1, value1 in sparsematrix.items():
        for key2, value2 in sparsematrix.items():
            # We don't want A versus B and also B versus A,
            # and we don't want A versus A.
            if key2 <= key1:
                continue

            sss = '{:s} ONE {:s} {:s}'.format(label, key1, str(value1))
            printoutput(sss, OUTFILE)
            sss = '{:s} TWO {:s} {:s}'.format(label, key2, str(value2))
            printoutput(sss, OUTFILE)

            # Dot product over the entries whose element 0 matches.
            product = 0.0
            for item1 in value1:
                for item2 in value2:
                    if item1[0] == item2[0]:
                        product += item1[1] * item2[1]

            length1 = doclength[key1]
            length2 = doclength[key2]
            if length1 == 0 or length2 == 0:
                # The cosine is undefined for a zero-length vector;
                # report no similarity instead of dividing by zero.
                product = 0.0
            else:
                product = product / length1
                product = product / length2

            printoutput(
                '{:s} THR {:s} {:s} {:10.4f}\n'.format(label, key1, key2,
                                                       product), OUTFILE)
            cosinedict[key1].append([key2, product])
            cosinedict[key2].append([key1, product])

    return cosinedict
Ejemplo n.º 6
0
def main(howmanytweets, dataoutputfilename, logfilename):
    """Open the output and log files, then collect tweets into them.

    Args:
        howmanytweets: Tweet count passed to gettweets and logged.
        dataoutputfilename: Path of the file the tweet data is written to;
            opened in 'w' mode, so existing content is replaced.
        logfilename: Path of the log file; also opened in 'w' mode.

    Both files are closed when the function exits, including when
    gettweets raises.
    """
    # Same open order as before (data file, then log file); the 'with'
    # block closes both, so the log file is no longer left open.
    with open(dataoutputfilename, 'w') as dataoutputfile, \
            open(logfilename, 'w') as logfile:
        outstring = 'MAIN: GET %d TWEETS' % (howmanytweets)
        printoutput(outstring, logfile)

        outstring = "MAIN: WRITE TWEETS TO FILE '%s'" % (dataoutputfilename)
        printoutput(outstring, logfile)

        outstring = "MAIN: WRITE LOG TO FILE    '%s'" % (logfilename)
        printoutput(outstring, logfile)

        gettweets(howmanytweets, dataoutputfile, logfile)
Ejemplo n.º 7
0
    def printresults(self, which):
        """Print a column legend, then one row per term of every document.

        Args:
            which: 'ALL' lists the keys of each doc's ``allterms``;
                'SHORT' lists the keys of each doc's ``topterms``. For any
                other value an error line is printed and ``sys.exit()`` is
                called when the first document is reached.

        Documents are visited in sorted filename order and terms in sorted
        key order. The per-term statistics are always read from the doc's
        ``allterms`` entry for that key. All output goes to OUTFILE.
        """
        legend = ('LEGEND'
                  '\nColumn 0: Filename'
                  '\nColumn 1: Raw freq of the term in the doc'
                  '\nColumn 2: Total word count for the doc'
                  '\nColumn 3: (Raw freq)/(Total word count)'
                  '\nColumn 4: Number of unique terms in the doc'
                  '\nColumn 5: Number of unique terms in the collection'
                  '\nColumn 6: Number of docs in which the term appears'
                  '\nColumn 7: idf for this term'
                  '\nColumn 8: tf-idf for this term'
                  '\nColumn 9: The term')
        printoutput(legend, OUTFILE)

        for docpath, document in sorted(self.docs.items()):
            # Keep only the last path component for display.
            shortname = docpath.split('/')[-1]

            if which == 'ALL':
                chosenterms = document.allterms
            elif which == 'SHORT':
                chosenterms = document.topterms
            else:
                printoutput('ERROR IN PRINTRESULTS {:s}'.format(which),
                            OUTFILE)
                sys.exit()

            for termkey, _ in sorted(chosenterms.items()):
                # Statistics come from allterms regardless of which
                # term list supplied the key.
                stats = document.allterms[termkey]

                columns = [
                    '{:s} '.format(shortname),
                    '{:7d} '.format(stats.rawfreq),
                    '{:7d} '.format(document.totalwordcount),
                    '{:10.4f} '.format(stats.tftf),
                    '{:7d} '.format(len(document.wordandposset)),
                    '{:7d} '.format(len(self.overallwordandposset)),
                    '{:7d} '.format(len(self.docset[termkey])),
                    '{:10.4f} '.format(stats.idf),
                    '{:10.4f} '.format(stats.tfidf),
                    '{:s} '.format(termkey),
                ]
                printoutput(''.join(columns), OUTFILE)
Ejemplo n.º 8
0
def dumptweet(label, thetweet, logfile):
    """Log a header line, then one line per key/value pair of a tweet.

    Args:
        label: Text included in the header and in every detail line.
        thetweet: Mapping whose ``items()`` are dumped in iteration order.
        logfile: Handle passed through to printoutput.
    """
    printoutput('\nDUMPONETWEET %s' % (label), logfile)
    for fieldname, fieldvalue in thetweet.items():
        detail = 'DUMPTWEET    %s %8s %s' % (label, fieldname, fieldvalue)
        printoutput(detail, logfile)