def PreprocessingFile(files):
    """Copy keyword-matching tweets inside a coordinate window to a text file.

    Reads tweets from the gzip file ``files`` with
    ``TweetFiles.iterateTweetsFromGzip`` and writes every selected tweet as one
    JSON document per line to a file named like ``files`` with the
    ``.txt.gz`` suffix replaced by ``.txt``.

    A tweet is selected when:

    * ``tweet['corrdinate']`` (key spelled as in the data) is not ``None``,
    * its first value, truncated to an integer, is in ``[30, 50)`` and its
      second value, truncated, is strictly between ``-125`` and ``-75``,
    * at least one entry of the module-level ``keyWords`` occurs in the
      tokenised text.

    Python 2 ``unicode`` text is tokenised with ``split()``; any other text
    is tokenised with the regular expression ``\\w+``.

    Tweets that raise while being inspected are skipped (best effort), but
    ``KeyboardInterrupt`` and ``SystemExit`` now propagate, and the output
    file is closed even when iteration fails.

    Args:
        files: Path of the gzip input file.
    """
    newfilename = files.split('.txt.gz')[0] + '.txt'
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromGzip(files):
            try:
                coordinate = tweet['corrdinate']
                if coordinate is None:
                    continue
                # '%d' truncates toward zero, exactly as the original
                # formatting-then-int() round trip did.
                first = int('%d' % coordinate[0])
                second = int('%d' % coordinate[1])
                if not (30 <= first < 50 and -125 < second < -75):
                    continue

                text = tweet['text']
                is_unicode = isinstance(text, unicode)
                if is_unicode:
                    tweetlist = text.split()
                else:
                    tweetlist = re.findall(r'\w+', text)

                # NOTE(review): a tweet matching several keywords is written
                # once per matching keyword, as before - confirm this is wanted.
                for checkword in keyWords:
                    if checkword in tweetlist:
                        if not is_unicode:
                            # debug output kept from the original non-unicode path
                            print(tweetlist)
                        # Serialise first so a failing dump cannot leave a
                        # partial record, and always terminate with a newline
                        # so the output stays one JSON document per line.
                        newfile.write(json.dumps(tweet) + "\n")
            except Exception:
                # Deliberate best effort: malformed tweets are skipped.
                continue
def PreprocessingFile(files):
    """Count how often each hashtag occurs in a tweet file and save the counts.

    Iterates the tweets of ``files`` with ``TweetFiles.iterateTweetsFromFile``
    and counts every entry of each tweet's ``'h'`` list. The resulting dict is
    written, as its ``str()`` form followed by a newline, to
    ``./<filename>dict`` where ``<filename>`` is the part of ``files`` after
    ``'reduced_tag_newyork/'``.

    Args:
        files: Input path; must contain ``'reduced_tag_newyork/'``.

    Returns:
        dict mapping each tag to its number of occurrences.

    Raises:
        IndexError: if ``files`` does not contain ``'reduced_tag_newyork/'``.
    """
    filename = files.split('reduced_tag_newyork/')[1]
    tagdict = {}
    for tweet in TweetFiles.iterateTweetsFromFile(files):
        for tag in tweet['h']:
            # dict.get avoids the per-tag scan of tagdict.keys()
            tagdict[tag] = tagdict.get(tag, 0) + 1

    # The context manager guarantees the counts are flushed and the handle
    # closed; the original left the file open.
    with open('./' + filename + 'dict', 'w') as tagdictfile:
        tagdictfile.write(str(tagdict) + "\n")
    return tagdict
def BuildDataStructure(directory):
    """Aggregate hashtag statistics globally and per location cell.

    Iterates the tweets of ``directory`` (passed straight to
    ``TweetFiles.iterateTweetsFromFile``) and, for every tweet with a
    non-empty ``'h'`` list, maps its ``'geo'`` (preferred) or ``'bb'`` value
    to a location with ``CheckLocation(point, 0.01, -74.25, -73.69, 40.48,
    40.96)``. The ``str()`` of that result is used as the location key.
    (The numeric arguments are presumably a cell size and a bounding box -
    confirm against ``CheckLocation``.)

    Tweets that carry neither ``'geo'`` nor ``'bb'`` are skipped. Previously
    such a tweet reused the location of the preceding tweet, or raised when
    it was the first one.

    A progress counter is printed every 1000 tweets.

    Args:
        directory: Path handed to ``TweetFiles.iterateTweetsFromFile``.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict)`` where

        * ``CollectionOfHashtags[tag]`` has ``'totalCount'``, ``'uids'``
          (distinct user ids, in first-seen order), ``'fot'`` (``tweet['t']``
          of the first tweet seen with the tag) and ``'fol'`` (location key
          of that first tweet);
        * ``LocationDict[loc]`` has ``'uids'`` (distinct user ids seen at the
          location) plus, per tag, a dict with ``'localcount'`` and ``'fot'``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            print(n)
        if not tweet['h']:
            continue

        if 'geo' in tweet:
            point = tweet['geo']
        elif 'bb' in tweet:
            point = tweet['bb']
        else:
            # No position at all: do not attribute the tweet to a stale cell.
            continue
        loc = str(CheckLocation(point, 0.01, -74.25, -73.69, 40.48, 40.96))
        uid = tweet['user']['id']

        if loc not in LocationDict:
            LocationDict[loc] = {'uids': []}
        location = LocationDict[loc]
        if uid not in location['uids']:
            location['uids'].append(uid)

        for tag in tweet['h']:
            if tag in CollectionOfHashtags:
                entry = CollectionOfHashtags[tag]
                entry['totalCount'] += 1
                if uid not in entry['uids']:
                    entry['uids'].append(uid)
            else:
                # First sighting: remember when and where the tag appeared.
                CollectionOfHashtags[tag] = {
                    'uids': [uid],
                    'totalCount': 1,
                    'fot': tweet['t'],
                    'fol': loc,
                }

            # NOTE(review): tags share a namespace with the 'uids' key, so a
            # hashtag literally named 'uids' would collide - unchanged here.
            if tag not in location:
                location[tag] = {'localcount': 1, 'fot': tweet['t']}
            else:
                location[tag]['localcount'] += 1
    return (CollectionOfHashtags, LocationDict)
def BuildDataStructure(directory):
    """Aggregate hashtag statistics globally and per numbered location.

    Iterates the tweets of ``directory`` (passed straight to
    ``TweetFiles.iterateTweetsFromFile``). For every tweet with a non-empty
    ``'h'`` list, its ``'geo'`` (preferred) or ``'bb'`` value is mapped to a
    location with ``CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)``.
    Each distinct location gets a sequential integer id, starting at 1.

    The location id is now resolved once per tweet, before any tag is
    counted. Previously it was only resolved when a tweet introduced a new
    hashtag, so tweets made up of known hashtags were counted under the id of
    an earlier tweet (or raised if no id existed yet). Tweets carrying
    neither ``'geo'`` nor ``'bb'`` are skipped instead of reusing a stale
    location.

    A progress counter is printed every 1000 tweets.

    Args:
        directory: Path handed to ``TweetFiles.iterateTweetsFromFile``.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)`` where

        * ``CollectionOfHashtags[tag]`` has ``'totalCount'``, ``'fot'``
          (``tweet['t']`` of the first tweet with the tag) and ``'fol'``
          (location id of that first tweet);
        * ``LocationDict[id][tag]`` has ``'localcount'`` and ``'fot'``;
        * ``LocationMap[id]`` is the location value returned by
          ``CheckLocation``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    n = 0
    for tweet in TweetFiles.iterateTweetsFromFile(directory):
        n += 1
        if n % 1000 == 0:
            print(n)
        if not tweet["h"]:
            continue

        if "geo" in tweet:
            point = tweet["geo"]
        elif "bb" in tweet:
            point = tweet["bb"]
        else:
            # No position at all: do not attribute the tweet to a stale cell.
            continue
        loc = CheckLocation(point, 1, -124.77, -66.95, 22.52, 49.38)

        # Linear scan kept on purpose: loc may be unhashable (TODO confirm
        # CheckLocation's return type), so it cannot key a reverse index.
        if loc not in LocationMap.values():
            locationNum += 1
            LocationMap[locationNum] = loc
            LocationDict[locationNum] = {}
        # presumably returns the id whose mapped value equals loc - verify
        # against GetKeyFromValue
        lockey = GetKeyFromValue(LocationMap, loc)
        localTags = LocationDict[lockey]

        for tag in tweet["h"]:
            if tag in CollectionOfHashtags:
                CollectionOfHashtags[tag]["totalCount"] += 1
            else:
                # First sighting: remember when and where the tag appeared.
                CollectionOfHashtags[tag] = {
                    "totalCount": 1,
                    "fot": tweet["t"],
                    "fol": lockey,
                }

            if tag not in localTags:
                localTags[tag] = {"localcount": 1, "fot": tweet["t"]}
            else:
                localTags[tag]["localcount"] += 1
    return (CollectionOfHashtags, LocationDict, LocationMap)
def BuildDataStructure(directory):
    """Aggregate hashtag statistics over every tweet file in a directory.

    Each entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromFile``. For every tweet with a non-empty
    ``'h'`` list, its ``'geo'`` (preferred) or ``'bb'`` value is mapped to a
    location with ``CheckLocation(point, 0.01, -125, -75, 30, 50)``. Each
    distinct location gets a sequential integer id, starting at 1.

    The location id is now resolved once per tweet, before any tag is
    counted. Previously it was only resolved when a tweet introduced a new
    hashtag, so other tweets were counted under the id of an earlier tweet
    (or raised if no id existed yet). Tweets carrying neither ``'geo'`` nor
    ``'bb'`` are skipped instead of reusing a stale location.

    Args:
        directory: Directory containing the tweet files. A trailing slash is
            no longer required, because paths are built with
            ``os.path.join``.

    Returns:
        Tuple ``(CollectionOfHashtags, LocationDict, LocationMap)`` where

        * ``CollectionOfHashtags[tag]`` has ``'totalCount'``, ``'fot'``
          (``tweet['t']`` of the first tweet with the tag) and ``'fol'``
          (location id of that first tweet);
        * ``LocationDict[id][tag]`` has ``'localcount'`` and ``'fot'``;
        * ``LocationMap[id]`` is the location value returned by
          ``CheckLocation``.
    """
    CollectionOfHashtags = {}
    LocationDict = {}
    LocationMap = {}
    locationNum = 0  # number of distinct locations registered so far
    for name in os.listdir(directory):
        print("current file reading is: " + name)
        path = os.path.join(directory, name)
        for tweet in TweetFiles.iterateTweetsFromFile(path):
            if not tweet['h']:
                continue

            if 'geo' in tweet:
                point = tweet['geo']
            elif 'bb' in tweet:
                point = tweet['bb']
            else:
                # No position at all: do not attribute the tweet to a stale
                # cell.
                continue
            loc = CheckLocation(point, 0.01, -125, -75, 30, 50)

            # Linear scan kept on purpose: loc may be unhashable (TODO
            # confirm CheckLocation's return type).
            if loc not in LocationMap.values():
                locationNum += 1
                LocationMap[locationNum] = loc
                LocationDict[locationNum] = {}
            # presumably returns the id whose mapped value equals loc -
            # verify against GetKeyFromValue
            lockey = GetKeyFromValue(LocationMap, loc)
            localTags = LocationDict[lockey]

            for tag in tweet['h']:
                if tag in CollectionOfHashtags:
                    CollectionOfHashtags[tag]['totalCount'] += 1
                else:
                    # First sighting: remember when and where it appeared.
                    CollectionOfHashtags[tag] = {
                        'totalCount': 1,
                        'fot': tweet['t'],
                        'fol': lockey,
                    }

                if tag not in localTags:
                    localTags[tag] = {'localcount': 1, 'fot': tweet['t']}
                else:
                    localTags[tag]['localcount'] += 1
    return (CollectionOfHashtags, LocationDict, LocationMap)
def DocWordFreqDict(path):
    """Build one word-frequency table per tweet of a gzip tweet file.

    Every tweet yielded by ``TweetFiles.iterateTweetsFromGzip(path)`` is
    treated as its own document, numbered from 1 in iteration order. Words
    are the ``\\w+`` matches found in ``tweet['text']``.

    Args:
        path: Path of the gzip tweet file.

    Returns:
        dict mapping document number -> {word: occurrences in that tweet}.
    """
    frequencies = {}
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for number, tweet in enumerate(tweets, 1):
        counts = frequencies[number] = {}
        for word in re.findall(r'\w+', tweet['text']):
            counts[word] = counts.get(word, 0) + 1
    return frequencies
def PreprocessingFile(files):
    """Copy the tweets accepted by ``BoundingNewYork`` to a ``2block`` file.

    Reads ``files`` with ``TweetFiles.iterateTweetsFromFile``. Each tweet's
    ``'geo'`` value (or ``'bb'`` when ``'geo'`` is absent, or ``[]`` when
    both are absent) is passed to ``BoundingNewYork``; tweets for which it
    returns ``1`` are written as one JSON document per line to
    ``/home/wei/ideamap/reduced_tag_newyork/<filename>2block``, where
    ``<filename>`` is the part of ``files`` after ``'reduced_tag_newyork/'``.

    The output file is now closed (and therefore flushed) when the function
    finishes or fails; the original never closed it.

    Args:
        files: Input path; must contain ``'reduced_tag_newyork/'``.

    Raises:
        IndexError: if ``files`` does not contain ``'reduced_tag_newyork/'``.
    """
    filename = files.split('reduced_tag_newyork/')[1]
    newfilename = '/home/wei/ideamap/reduced_tag_newyork/' + filename + '2block'
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                temploc = []
            if BoundingNewYork(temploc) == 1:
                newfile.write(json.dumps(tweet) + "\n")
def PreprocessingFile(files):
    """Copy hashtagged tweets accepted by ``BoundingUS`` to a ``.txt1`` file.

    Reads ``files`` with ``TweetFiles.iterateTweetsFromFile``. Tweets with an
    empty ``'h'`` list are ignored. For the rest, the ``'geo'`` value (or
    ``'bb'`` when ``'geo'`` is absent, or ``[]`` when both are absent) is
    passed to ``BoundingUS``; tweets for which it returns ``1`` are written
    as one JSON document per line to
    ``/home/wei/geo_hashtag_data/<filename>.txt1``, where ``<filename>`` is
    the part of ``files`` after ``'reduced_geo/'``.

    The output file is now closed (and therefore flushed) when the function
    finishes or fails; the original never closed it.

    Args:
        files: Input path; must contain ``'reduced_geo/'``.

    Raises:
        IndexError: if ``files`` does not contain ``'reduced_geo/'``.
    """
    filename = files.split('reduced_geo/')[1]
    newfilename = '/home/wei/geo_hashtag_data/' + filename + '.txt1'
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if not tweet['h']:
                continue
            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                temploc = []
            if BoundingUS(temploc) == 1:
                newfile.write(json.dumps(tweet) + "\n")
def PreprocessingFile(files, tagdict):
    """Copy hashtagged tweets inside a fixed box and count their hashtags.

    Reads ``files`` with ``TweetFiles.iterateTweetsFromFile``. A tweet is
    kept when its ``'h'`` list is non-empty and its position (``'geo'``, or
    ``'bb'`` when ``'geo'`` is absent) satisfies
    ``24.52 < position[0] < 49.38`` and ``-124.77 < position[1] < -66.95``.
    Kept tweets are written as one JSON document per line to
    ``/home/wei/new_geo_hashtag_data/<filename>.txt``, where ``<filename>``
    is the part of ``files`` after ``'reduced_geo/'``. Every hashtag of a
    kept tweet is counted in ``tagdict``.

    Tweets without any usable position are now skipped; previously they
    raised ``IndexError`` on the empty placeholder list. The output file is
    closed when the function finishes or fails.

    Args:
        files: Input path; must contain ``'reduced_geo/'``.
        tagdict: dict of tag -> count, updated in place.

    Raises:
        IndexError: if ``files`` does not contain ``'reduced_geo/'``.
    """
    filename = files.split('reduced_geo/')[1]
    newfilename = '/home/wei/new_geo_hashtag_data/' + filename + '.txt'
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromFile(files):
            if not tweet['h']:
                continue
            if 'geo' in tweet:
                temploc = tweet['geo']
            elif 'bb' in tweet:
                temploc = tweet['bb']
            else:
                temploc = []
            if not temploc:
                # nothing to compare against the bounding box
                continue
            if 24.52 < temploc[0] < 49.38 and -124.77 < temploc[1] < -66.95:
                for tag in tweet['h']:
                    tagdict[tag] = tagdict.get(tag, 0) + 1
                newfile.write(json.dumps(tweet) + "\n")
import json
import os

from twitter import TweetFiles

# Script: keep only the tweets that carry at least one hashtag whose count in
# the saved tag dictionary is >= 100000, writing them one JSON document per
# line to `path`.
dir = "/home/wei/new_geo_hashtag_data/2011_3.txt"
dictpath = "/home/wei/ideamap/2011_3dict"
path = "/home/wei/new_geo_hashtag_data/reduced100000_2011_3"
path1 = ""

# The dictionary file holds the printed form of a dict on its first line.
with open(dictpath, 'r') as f:
    # NOTE(review): eval() executes whatever the file contains. It is kept to
    # preserve behaviour; switch to ast.literal_eval if the file can ever
    # come from an untrusted source.
    tagdict = eval(f.readline())

# Hashtags frequent enough to be kept, with their counts.
tagdict1 = {}
for tag in tagdict.keys():
    if tagdict[tag] >= 100000:
        tagdict1[tag] = tagdict[tag]
        print("%s %s" % (tag, tagdict[tag]))

# The context manager closes (and flushes) the output; the original left both
# file handles open.
with open(path, 'w') as f1:
    for tweet in TweetFiles.iterateTweetsFromFile(dir):
        for tag in tweet['h']:
            if tag in tagdict1:
                json.dump(tweet, f1)
                f1.write('\n')
                # one copy per tweet, however many frequent tags it has
                break
def DocidWordFreqTimeLocationDictFromDir(directory):
    """Index every tweet under ``directory`` by words, time and location.

    Each entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``. Every tweet is one document; ids
    are assigned sequentially from 1 in reading order.

    For each tweet:

    * ``tweet['time']`` is split on single spaces; field 5 is the year,
      field 1 a month name looked up in the module-level ``monthDict``,
      field 2 the day and field 3 an ``HH:MM...`` clock value.
    * The time stamp is ``YYYYMMDDHH`` followed by the ten-minute bucket of
      the minute (0-5).
    * The location stamp is the two values of ``tweet['corrdinate']`` (key
      spelled as in the data), each formatted with ``'%d'`` and concatenated;
      when the value is ``None``, ``"no location info"`` is printed and
      ``'0'``/``'0'`` is used.
    * Words are the ``\\w+`` matches in ``tweet['text']``.

    A tweet that raises while being parsed is skipped as a whole. It still
    consumes a document id, but unlike the original it no longer leaves a
    half-filled record in any of the returned dicts. ``KeyboardInterrupt``
    and ``SystemExit`` now propagate.

    Args:
        directory: Directory of gzip tweet files. A trailing slash is no
            longer required, because paths are built with ``os.path.join``.

    Returns:
        Tuple ``(collectionWordDict, dict2, timeLocationStamps,
        timeLocationStampToDocIdDict, timeToLocationDict,
        locationToTimeDict)``:

        * ``collectionWordDict``: word -> occurrences over all documents;
        * ``dict2``: docId -> ``{'docWordCount', 'time', 'location',
          'words'}``;
        * ``timeLocationStamps``: docId -> ``"<time>,<location>"``;
        * ``timeLocationStampToDocIdDict``: that stamp -> list of docIds;
        * ``timeToLocationDict``: time stamp -> list of location stamps
          (one entry per document, duplicates kept);
        * ``locationToTimeDict``: location stamp -> list of time stamps
          (one entry per document, duplicates kept).
    """
    collectionWordDict = {}
    dict2 = {}
    timeLocationStamps = {}
    timeLocationStampToDocIdDict = {}
    timeToLocationDict = {}
    locationToTimeDict = {}
    docId = 0
    for name in os.listdir(directory):
        print("current file is: " + name)
        path = os.path.join(directory, name)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            # Every tweet consumes an id, including the ones skipped below,
            # so ids keep matching the reading order.
            docId += 1
            try:
                # --- time ---
                fields = tweet['time'].split(' ')
                clock = fields[3].split(':')
                when = {
                    'year': int(fields[5]),
                    'month': monthDict[fields[1]],
                    'day': int(fields[2]),
                    'hour': int(clock[0]),
                    'minute': int(clock[1]),
                }
                minute = when['minute']
                # ten-minute bucket; anything from 50 upwards is bucket 5
                minuteStamp = 0 if minute < 10 else min(minute // 10, 5)
                timeStamp = '%04d%02d%02d%02d%d' % (
                    when['year'], when['month'], when['day'], when['hour'],
                    minuteStamp)

                # --- location ---
                coordinate = tweet['corrdinate']
                if coordinate is not None:
                    tempLoc1 = '%d' % coordinate[0]
                    tempLoc2 = '%d' % coordinate[1]
                else:
                    print("no location info")
                    tempLoc1 = '0'
                    tempLoc2 = '0'
                locationStamp = tempLoc1 + tempLoc2

                # --- words ---
                # NOTE(review): tokenisation uses \w+; confirm this is the
                # intended word source rather than a whitespace split.
                tokens = re.findall(r'\w+', tweet['text'])
                words = {}
                for word in tokens:
                    words[word] = words.get(word, 0) + 1
            except Exception:
                # Deliberate best effort: an unparsable tweet is dropped.
                continue

            # Commit only after the whole tweet parsed, so all six
            # structures stay consistent with one another.
            stamp = timeStamp + ',' + locationStamp
            dict2[docId] = {
                'docWordCount': len(tokens),
                'time': when,
                'location': [tempLoc1, tempLoc2],
                'words': words,
            }
            timeLocationStamps[docId] = stamp
            timeLocationStampToDocIdDict.setdefault(stamp, []).append(docId)
            timeToLocationDict.setdefault(timeStamp, []).append(locationStamp)
            locationToTimeDict.setdefault(locationStamp, []).append(timeStamp)
            for word, count in words.items():
                collectionWordDict[word] = collectionWordDict.get(word, 0) + count
    return (collectionWordDict, dict2, timeLocationStamps,
            timeLocationStampToDocIdDict, timeToLocationDict,
            locationToTimeDict)
def DocidWordFreqTimeLocationDictFromDir(directory, keywords):
    """Index the tweets under ``directory`` that match any of ``keywords``.

    Each entry of ``os.listdir(directory)`` is appended to ``directory``
    (plain string concatenation, so ``directory`` must end with a path
    separator) and read with ``TweetFiles.iterateTweetsFromGzip``. A tweet is
    kept when ``similarityCal(token, tweet['text'])`` is not ``<= 0`` for at
    least one ``token`` of ``keywords``; ``tweet['id']`` is its document id.

    ``tweet['time']`` is split on single spaces: field 5 is the year, field 1
    a month name looked up in the module-level ``monthDict``, field 2 the day
    and field 3 an ``HH:MM...`` clock value. The time stamp is ``YYYYMMDD``.

    Args:
        directory: Directory of gzip tweet files, with trailing separator.
        keywords: Iterable of tokens handed to ``similarityCal``.

    Returns:
        Tuple ``(collectionWordDict, dict2, timeStamps,
        timeStampToDocIdDict)``:

        * ``collectionWordDict``: token -> number of matches over all tweets;
        * ``dict2``: tweet id -> ``{'words', 'docWordCount', 'time'}``;
        * ``timeStamps``: tweet id -> ``'YYYYMMDD'``;
        * ``timeStampToDocIdDict``: ``'YYYYMMDD'`` -> list of tweet ids.
    """
    collectionWordDict = {}
    dict2 = {}
    timeStamps = {}
    timeStampToDocIdDict = {}

    for name in os.listdir(directory):
        print("current file is: " + name)
        for tweet in TweetFiles.iterateTweetsFromGzip(directory + name):
            docId = tweet['id']
            matched = {}
            # Registered up front, and removed again if nothing matches, so
            # an earlier record with the same id is replaced or dropped.
            record = dict2[docId] = {'words': matched, 'docWordCount': 0}

            for token in keywords:
                if similarityCal(token, tweet['text']) <= 0:
                    continue
                record['docWordCount'] += 1
                matched[token] = matched.get(token, 0) + 1
                collectionWordDict[token] = collectionWordDict.get(token, 0) + 1

            if not record['docWordCount']:
                del dict2[docId]
                continue

            # time stamp
            parts = re.split(r' ', tweet['time'])
            yearValue = int(parts[5])
            monthValue = monthDict[parts[1]]
            dayValue = int(parts[2])
            record['time'] = {
                'year': yearValue,
                'month': monthValue,
                'day': dayValue,
                'hour': int(re.split(r':', parts[3])[0]),
                'minute': int(re.split(r':', parts[3])[1]),
            }

            dayStamp = '%04d' % yearValue + '%02d' % monthValue + '%02d' % dayValue
            timeStamps[docId] = dayStamp
            timeStampToDocIdDict.setdefault(dayStamp, []).append(docId)

    return (collectionWordDict, dict2, timeStamps, timeStampToDocIdDict)
def UserIdNameDict(path):
    """Map user ids to user names for the tweets of a gzip tweet file.

    Reads ``tweet['user']['id']`` and ``tweet['user']['name']`` from every
    tweet yielded by ``TweetFiles.iterateTweetsFromGzip(path)``. When an id
    occurs more than once, the name from the last tweet read wins.

    Args:
        path: Path of the gzip tweet file.

    Returns:
        dict mapping user id -> user name.
    """
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    return {tweet['user']['id']: tweet['user']['name'] for tweet in tweets}
def ReadFile(path):
    """Print every tweet of a gzip tweet file, one per line.

    Args:
        path: Path handed to ``TweetFiles.iterateTweetsFromGzip``.
    """
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for item in tweets:
        print(item)