def PreprocessingFile(files):
    """Filter a gzipped tweet file by location box and keyword.

    Tweets are read with ``TweetFiles.iterateTweetsFromGzip(files)``.  The
    survivors are written, one JSON object per line, to a file named after
    the part of ``files`` that precedes ``'.txt.gz'`` with ``'.txt'``
    appended.

    A tweet is kept when both of these hold:

    * ``tweet['corrdinate']`` (sic -- that is the key this code reads) is not
      ``None`` and, after truncation to integers, its first element lies in
      ``[30, 50)`` and its second element lies in ``(-125, -75)``;
    * at least one entry of the module-level ``keyWords`` is among the
      tweet's tokens.  ``unicode`` text is tokenised by whitespace, any other
      text by runs of word characters.

    Changes from the previous version:

    * the output file is closed even if reading the input raises;
    * every kept tweet is followed by a newline (the ``unicode`` branch used
      to write records back to back);
    * a tweet matching several keywords is written once, not once per
      keyword;
    * only the errors a malformed tweet can produce are skipped, instead of
      silently swallowing every exception.

    Args:
        files: Path of the gzipped input file.
    """
    newfilename = files.split('.txt.gz')[0] + '.txt'
    with open(newfilename, 'w') as newfile:
        for tweet in TweetFiles.iterateTweetsFromGzip(files):
            try:
                coords = tweet['corrdinate']
                if coords is None:
                    continue
                first = int(coords[0])
                second = int(coords[1])
                if not (30 <= first < 50 and -125 < second < -75):
                    continue

                text = tweet['text']
                isUnicode = isinstance(text, unicode)
                if isUnicode:
                    tokens = text.split()
                else:
                    tokens = re.findall(r'\w+', text)

                if not any(word in tokens for word in keyWords):
                    continue
                if not isUnicode:
                    # Kept from the original: matches on non-unicode text
                    # are echoed to stdout.
                    print(tokens)
                newfile.write(json.dumps(tweet) + "\n")
            except (KeyError, IndexError, TypeError, ValueError,
                    OverflowError):
                # Best effort: a tweet with missing or malformed fields is
                # skipped rather than aborting the whole file.
                continue
def DocWordFreqDict(path):
    """Build per-tweet word counts, treating every tweet as one document.

    Tweets come from ``TweetFiles.iterateTweetsFromGzip(path)`` and are
    numbered from 1 in iteration order.  Each tweet's ``'text'`` is split
    into runs of word characters and the occurrences are tallied.

    Args:
        path: Path handed to ``TweetFiles.iterateTweetsFromGzip``.

    Returns:
        A dict mapping document number to a ``{word: count}`` dict.
    """
    freqByDoc = {}
    for docNum, tweet in enumerate(TweetFiles.iterateTweetsFromGzip(path), 1):
        # Register the document before tokenising, so a tweet without any
        # word still gets an (empty) entry.
        counts = freqByDoc[docNum] = {}
        for word in re.findall(r'\w+', tweet['text']):
            counts[word] = counts.get(word, 0) + 1
    return freqByDoc
def DocidWordFreqTimeLocationDictFromDir(directory):
    """Index every tweet under ``directory`` by words, time and location.

    NOTE(review): a second function with this same name (taking an extra
    ``keywords`` argument) is defined later in this file; if both stay at
    module level, the later one replaces this one.

    Every entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``.  Each tweet gets the next integer
    document id (starting at 1) and contributes:

    * a time stamp ``YYYYMMDDHH`` followed by one digit for the ten-minute
      slot (0-5), built from the space-separated ``tweet['time']`` field:
      item 1 is looked up in the module-level ``monthDict``, item 2 is the
      day, item 3 is ``HH:MM...`` and item 5 is the year;
    * a location stamp: the two elements of ``tweet['corrdinate']`` (sic)
      formatted with ``%d`` and concatenated, or ``'00'`` when the field is
      ``None``;
    * word counts over the runs of word characters in ``tweet['text']``.

    NOTE(review): the location stamp has no separator between its two parts,
    so distinct coordinate pairs can collide.  The format is left unchanged
    because consumers of these keys are not visible from here.

    Changes from the previous version:

    * a tweet is parsed completely before anything is stored, so a malformed
      tweet no longer leaves a half-built record behind (it still consumes a
      document id, as before);
    * only the errors a malformed tweet can produce are skipped, instead of
      a bare ``except``;
    * file paths are built with ``os.path.join``, so ``directory`` no longer
      needs a trailing separator;
    * membership tests no longer scan ``dict.keys()``.

    Args:
        directory: Directory whose entries are all gzipped tweet files.

    Returns:
        A 6-tuple ``(collectionWordDict, dict2, timeLocationStamps,
        timeLocationStampToDocIdDict, timeToLocationDict,
        locationToTimeDict)``:

        * ``collectionWordDict``: word -> count over all stored tweets;
        * ``dict2``: docId -> ``{'docWordCount', 'time', 'location',
          'words'}``;
        * ``timeLocationStamps``: docId -> ``'<time stamp>,<location stamp>'``;
        * ``timeLocationStampToDocIdDict``: that combined stamp -> [docIds];
        * ``timeToLocationDict``: time stamp -> [location stamps];
        * ``locationToTimeDict``: location stamp -> [time stamps].
    """
    collectionWordDict = {}
    dict2 = {}
    timeLocationStamps = {}
    timeLocationStampToDocIdDict = {}
    timeToLocationDict = {}
    locationToTimeDict = {}
    docId = 0

    for fileName in os.listdir(directory):
        print("current file is: " + fileName)
        path = os.path.join(directory, fileName)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            # Ids are consumed even by tweets that end up skipped, which
            # keeps the numbering identical to the previous behaviour.
            docId += 1
            try:
                # ---- time stamp
                timeParts = re.split(r' ', tweet['time'])
                clock = re.split(r':', timeParts[3])
                timeInfo = {
                    'year': int(timeParts[5]),
                    'month': monthDict[timeParts[1]],
                    'day': int(timeParts[2]),
                    'hour': int(clock[0]),
                    'minute': int(clock[1]),
                }
                # Ten-minute slot, clamped to 0..5 exactly like the former
                # if/elif chain (below 10 -> 0, 50 and above -> 5).
                minuteStamp = min(max(timeInfo['minute'] // 10, 0), 5)
                timeStamp = '%04d%02d%02d%02d%d' % (
                    timeInfo['year'], timeInfo['month'], timeInfo['day'],
                    timeInfo['hour'], minuteStamp)

                # ---- location stamp
                coords = tweet['corrdinate']
                if coords is not None:
                    tempLoc1 = '%d' % coords[0]
                    tempLoc2 = '%d' % coords[1]
                else:
                    print("no location info")
                    tempLoc1 = '0'
                    tempLoc2 = '0'
                locationStamp = tempLoc1 + tempLoc2

                # ---- words
                words = re.findall(r'\w+', tweet['text'])
            except (KeyError, IndexError, TypeError, ValueError,
                    AttributeError, OverflowError):
                # Best effort: skip tweets with missing or malformed fields.
                continue

            # Nothing below can fail, so the record is stored atomically.
            wordCounts = {}
            for word in words:
                wordCounts[word] = wordCounts.get(word, 0) + 1
                collectionWordDict[word] = collectionWordDict.get(word, 0) + 1

            dict2[docId] = {
                'docWordCount': len(words),
                'time': timeInfo,
                'location': [tempLoc1, tempLoc2],
                'words': wordCounts,
            }

            combinedStamp = timeStamp + ',' + locationStamp
            timeLocationStamps[docId] = combinedStamp
            timeLocationStampToDocIdDict.setdefault(
                combinedStamp, []).append(docId)
            timeToLocationDict.setdefault(timeStamp, []).append(locationStamp)
            locationToTimeDict.setdefault(locationStamp, []).append(timeStamp)

    return (collectionWordDict, dict2, timeLocationStamps,
            timeLocationStampToDocIdDict, timeToLocationDict,
            locationToTimeDict)
def DocidWordFreqTimeLocationDictFromDir(directory, keywords):
    """Index the tweets under ``directory`` that relate to ``keywords``.

    Every entry of ``os.listdir(directory)`` is read with
    ``TweetFiles.iterateTweetsFromGzip``.  For each tweet, every entry of
    ``keywords`` is scored with ``similarityCal(token, tweet['text'])``; a
    token counts as present unless its score is ``<= 0``.  Tweets with no
    present token are dropped.  The others are stored under their own
    ``tweet['id']`` and stamped with the date ``YYYYMMDD`` taken from the
    space-separated ``tweet['time']`` field (item 1 is looked up in the
    module-level ``monthDict``, item 2 is the day, item 3 is ``HH:MM...``
    and item 5 is the year).

    Malformed tweets are NOT skipped: any exception propagates to the
    caller, as in the previous version, where the ``try`` was disabled.

    Changes from the previous version:

    * file paths are built with ``os.path.join``, so ``directory`` no longer
      needs a trailing separator;
    * unused locals (the formatted hour, the half-hour flag, the initial
      ``docId``) and the ``if 1:`` scaffold are gone;
    * membership tests no longer scan ``dict.keys()``.

    Args:
        directory: Directory whose entries are all gzipped tweet files.
        keywords: Iterable of tokens to score against each tweet's text.

    Returns:
        A 4-tuple ``(collectionWordDict, dict2, timeStamps,
        timeStampToDocIdDict)``:

        * ``collectionWordDict``: token -> number of tweets/occurrences in
          which it was counted as present;
        * ``dict2``: tweet id -> ``{'words', 'docWordCount', 'time'}`` where
          ``'words'`` maps present tokens to counts and ``'docWordCount'``
          is the total of those counts;
        * ``timeStamps``: tweet id -> ``'YYYYMMDD'``;
        * ``timeStampToDocIdDict``: ``'YYYYMMDD'`` -> [tweet ids].
    """
    collectionWordDict = {}
    dict2 = {}
    timeStamps = {}
    timeStampToDocIdDict = {}

    for fileName in os.listdir(directory):
        print("current file is: " + fileName)
        path = os.path.join(directory, fileName)
        for tweet in TweetFiles.iterateTweetsFromGzip(path):
            docId = tweet['id']

            # ---- keyword matches
            wordCounts = {}
            matchCount = 0
            for token in keywords:
                if similarityCal(token, tweet['text']) <= 0:
                    continue
                matchCount += 1
                wordCounts[token] = wordCounts.get(token, 0) + 1
                collectionWordDict[token] = collectionWordDict.get(token, 0) + 1

            if matchCount == 0:
                # The previous version created the entry first and deleted
                # it here, which also removed any earlier record stored
                # under the same id; pop() keeps that exact effect.
                dict2.pop(docId, None)
                continue

            # ---- date stamp
            timeParts = re.split(r' ', tweet['time'])
            clock = re.split(r':', timeParts[3])
            timeInfo = {
                'year': int(timeParts[5]),
                'month': monthDict[timeParts[1]],
                'day': int(timeParts[2]),
                'hour': int(clock[0]),
                'minute': int(clock[1]),
            }
            timeStamp = '%04d%02d%02d' % (
                timeInfo['year'], timeInfo['month'], timeInfo['day'])

            dict2[docId] = {
                'words': wordCounts,
                'docWordCount': matchCount,
                'time': timeInfo,
            }
            timeStamps[docId] = timeStamp
            timeStampToDocIdDict.setdefault(timeStamp, []).append(docId)

    return (collectionWordDict, dict2, timeStamps, timeStampToDocIdDict)
def UserIdNameDict(path):
    """Map user ids to user names for the tweets stored at ``path``.

    Tweets are read with ``TweetFiles.iterateTweetsFromGzip(path)``.  When
    the same user id occurs more than once, the name from the last tweet
    read wins.

    Args:
        path: Path handed to ``TweetFiles.iterateTweetsFromGzip``.

    Returns:
        A dict of ``tweet['user']['id']`` -> ``tweet['user']['name']``.
    """
    namesById = {}
    for tweet in TweetFiles.iterateTweetsFromGzip(path):
        userName = tweet['user']['name']
        namesById[tweet['user']['id']] = userName
    return namesById
def ReadFile(path):
    """Print every tweet yielded by ``TweetFiles.iterateTweetsFromGzip(path)``.

    Args:
        path: Path handed to ``TweetFiles.iterateTweetsFromGzip``.
    """
    tweets = TweetFiles.iterateTweetsFromGzip(path)
    for entry in tweets:
        print(entry)