def saveTags(self, strTags, tag2code):
    '''
    Translate a space-separated tag string into the list of its tag codes.

    strTags is split on single spaces and every piece is normalised with
    clean_line() before being looked up in tag2code. Pieces that have no
    entry in tag2code are skipped.

    Returns the codes in input order (duplicates are kept).
    '''
    cleaned = (clean_line(tstr) for tstr in strTags.split(" "))
    # `in` instead of dict.has_key(): has_key() is deprecated and no longer
    # exists on Python 3 dictionaries.
    return [tag2code[tstr] for tstr in cleaned if tstr in tag2code]
def saveTags(self, strTags, tag2code):
    '''
    Translate a space-separated tag string into the list of its tag codes.

    strTags is split on single spaces and every piece is normalised with
    clean_line() before being looked up in tag2code. Pieces that have no
    entry in tag2code are skipped.

    Returns the codes in input order (duplicates are kept).
    '''
    cleaned = (clean_line(tstr) for tstr in strTags.split(" "))
    # `in` instead of dict.has_key(): has_key() is deprecated and no longer
    # exists on Python 3 dictionaries.
    return [tag2code[tstr] for tstr in cleaned if tstr in tag2code]
def saveOwnerInfo(self, ownStr, tag2code):
    '''
    Split an owner record into owner id, hometown parts and hometown tag codes.

    ownStr is split on "|"; e.g. an owner record may split into
    ['14678786@N00', 'milwaukee, United States'].
    The first field is returned untouched. When a second field is present
    it is split on "," and each piece is normalised with clean_line();
    pieces that are keys of tag2code also contribute their code.

    Returns a tuple (ownerId, hometown, hometownTags); both lists are empty
    when ownStr contains no "|".
    '''
    # Split once and reuse the result instead of re-splitting ownStr.
    blk = ownStr.split("|")
    hometown = []
    hometownTags = []
    if len(blk) > 1:
        for hw in blk[1].split(","):
            hw_clear = clean_line(hw)
            hometown.append(hw_clear)
            # `in` instead of the deprecated dict.has_key().
            if hw_clear in tag2code:
                hometownTags.append(tag2code[hw_clear])
    return blk[0], hometown, hometownTags
def saveOwnerInfo(self, ownStr, tag2code):
    '''
    Split an owner record into owner id, hometown parts and hometown tag codes.

    ownStr is split on "|"; e.g. an owner record may split into
    ['14678786@N00', 'milwaukee, United States'].
    The first field is returned untouched. When a second field is present
    it is split on "," and each piece is normalised with clean_line();
    pieces that are keys of tag2code also contribute their code.

    Returns a tuple (ownerId, hometown, hometownTags); both lists are empty
    when ownStr contains no "|".
    '''
    # Split once and reuse the result instead of re-splitting ownStr.
    blk = ownStr.split("|")
    hometown = []
    hometownTags = []
    if len(blk) > 1:
        for hw in blk[1].split(","):
            hw_clear = clean_line(hw)
            hometown.append(hw_clear)
            # `in` instead of the deprecated dict.has_key().
            if hw_clear in tag2code:
                hometownTags.append(tag2code[hw_clear])
    return blk[0], hometown, hometownTags
# 3: tags
# 4: timestamp
# 5: timestamp
# example of output line:
#4876969322 Boats on The Thames at Penton Hook Lock http://www.flickr.com/photos/37413900@N04/4876969322/ thames pleasure boats penton hook lock gates laleham staines surrey barge 51.414745|-0.500274|16|Staines|Surrey|Angleterre|Royaume Uni|16 37413900@N04|Greater London, England United Kingdom
# 0: imageId
# 1: Title
# 2: url
# 3: tags
# 4: coordinates
# 5: userId|hometown

# Convert every stdin record into one tab-separated line on stdout.
# Fields written: imageId, title (always empty here), url, tags,
# "lat|lon|acc", userId.

# Hard-coded expected number of input lines; only used for the percentage
# shown in the progress message.
totLines = 3185258.0
lines = 0
# Trailing comma: Python 2 print statement without the final newline, so the
# "\r" progress updates below overwrite this message in place.
print >> sys.stderr, "*> Convert Images MetaData",
for line in sys.stdin:
    lines += 1
    blk = line.strip().split(" : ")
    # First field is "<userId>/<imageId>".
    usrId, imgId = blk[0].split("/")
    title = ""
    url = blk[1]
    lat, lon, acc = getGeoDataFromImageMeta(blk[2])
    coord = lat + "|" + lon + "|" + acc
    tags = clean_line(blk[3])
    print >> sys.stdout, "%s\t%s\t%s\t%s\t%s\t%s" % (imgId.encode('utf-8'), title, url.encode('utf-8'), tags.encode('utf-8'), coord.encode('utf-8'), usrId.encode('utf-8'))
    # Report progress once every 200000 lines. The previous test
    # (`if lines % 200000:`) was true for every line EXCEPT multiples of
    # 200000, so the message was rewritten on almost every input line.
    if lines % 200000 == 0:
        print >> sys.stderr, "\r*> Converting Images MetaData [%2.2f%s]" % (float(lines) / totLines * 100, '%'),
print >> sys.stderr, "\r*> Converting Images MetaData: %d lines" % lines
#-----------------------------------------
# Loading Stopword
# open() in a `with` block (instead of the Python-2-only file() constructor)
# so the handle is closed as soon as the stopwords have been loaded.
with open(stopwordsFilePath, 'r') as stopwordFile:
    stopworddiz = loadStopword(stopwordFile)

# Trailing comma: Python 2 print statement without the final newline.
print >> sys.stderr, "*> [InputSet] Parsing tags",
lines = 0
#totLines = 10152.0
totLines = 3195410.0
for line in sys.stdin:
    lines += 1
    # blk = ['2408552791', 'La electricidad est\xc3\xa1tica', 'http://flickr.com/photos/39556080@N00/2408552791',
    #        'museo pelos pelo carina van de graaff', '-32.959093|-60.623738|Argentina|Santa Fe|Rosario', '39556080@N00|Rosario, Argentina']
    blk = clean_line(line).split("\t")
    # Extract tags and mtags
    tags, mtags = tagsfilters(blk[3])
    mtags = mtags.strip()
    # Filter the individual tags. The previous loop iterated over the `tags`
    # string itself, i.e. character by character, so every item failed the
    # length test and the stopword filter never ran.
    keptTags = []
    for tag in tags.split():
        if len(tag) < 2:
            continue
        # check if it is a stopword
        # for key in stopworddiz.keys():
        #     if tag in key or key in tag:
        if tag in stopworddiz:
            continue
        keptTags.append(tag)
    # NOTE(review): tags are re-joined with single spaces and no trailing
    # space; confirm the cleaning step that follows expects this form.
    tags = " ".join(keptTags)
    # clean tags
#-----------------------------------------
# Loading Stopword
# open() in a `with` block (instead of the Python-2-only file() constructor)
# so the handle is closed as soon as the stopwords have been loaded.
with open(stopwordsFilePath, 'r') as stopwordFile:
    stopworddiz = loadStopword(stopwordFile)

# Trailing comma: Python 2 print statement without the final newline.
print >> sys.stderr, "*> [InputSet] Parsing tags",
lines = 0
#totLines = 10152.0
totLines = 3195410.0
for line in sys.stdin:
    lines += 1
    # blk = ['2408552791', 'La electricidad est\xc3\xa1tica', 'http://flickr.com/photos/39556080@N00/2408552791',
    #        'museo pelos pelo carina van de graaff', '-32.959093|-60.623738|Argentina|Santa Fe|Rosario', '39556080@N00|Rosario, Argentina']
    blk = clean_line(line).split("\t")
    # Extract tags and mtags
    tags, mtags = tagsfilters(blk[3])
    mtags = mtags.strip()
    # Filter the individual tags. The previous loop iterated over the `tags`
    # string itself, i.e. character by character, so every item failed the
    # length test and the stopword filter never ran.
    keptTags = []
    for tag in tags.split():
        if len(tag) < 2:
            continue
        # check if it is a stopword
        # for key in stopworddiz.keys():
        #     if tag in key or key in tag:
        if tag in stopworddiz:
            continue
        keptTags.append(tag)
    # NOTE(review): tags are re-joined with single spaces and no trailing
    # space; confirm the cleaning step that follows expects this form.
    tags = " ".join(keptTags)
    # clean tags