Example #1
0
 def saveTags(self, strTags, tag2code):
     ''' Translate a space-separated tag string into a list of tag codes.

     strTags is split on single spaces, each piece is passed through
     clean_line() and then looked up in tag2code. Pieces that are not
     keys of tag2code are dropped.

     Returns the list of matching tag2code values, in input order.
     '''
     codeList = []
     for rawTag in strTags.split(" "):
         tag = clean_line(rawTag)
         # Keep only known tags. The `in` operator is used instead of
         # dict.has_key(), which no longer exists in Python 3.
         if tag in tag2code:
             codeList.append(tag2code[tag])
     return codeList
	def saveTags(self, strTags, tag2code):
		''' Translate a space-separated tag string into a list of tag codes.

		strTags is split on single spaces, each piece is passed through
		clean_line() and then looked up in tag2code. Pieces that are not
		keys of tag2code are dropped.

		Returns the list of matching tag2code values, in input order.
		'''
		codeList = []
		for rawTag in strTags.split(" "):
			tag = clean_line( rawTag )
			# Keep only known tags. The `in` operator is used instead of
			# dict.has_key(), which no longer exists in Python 3.
			if tag in tag2code:
				codeList.append( tag2code[tag] )
		return codeList
Example #3
0
 def saveOwnerInfo(self, ownStr, tag2code):
     ''' Split an owner field on "|" and resolve its location pieces.

     Example of ownStr.split("|"):
         ['14678786@N00', 'milwaukee, United States']

     The second "|"-separated field, when present, is split on "," and
     each piece is passed through clean_line().

     Returns a 3-tuple:
         - the text before the first "|",
         - the list of cleaned location pieces (empty without a "|"),
         - the tag2code values of those pieces that are keys of tag2code.
     '''
     hometown = []
     hometownTags = []
     # Split once and reuse the result for both the id and the location.
     blk = ownStr.split("|")
     if len(blk) > 1:
         for hw in blk[1].split(","):
             hw_clear = clean_line(hw)
             hometown.append(hw_clear)
             # `in` replaces dict.has_key(), which Python 3 removed.
             if hw_clear in tag2code:
                 hometownTags.append(tag2code[hw_clear])
     return blk[0], hometown, hometownTags
	def saveOwnerInfo(self, ownStr, tag2code):
		''' Split an owner field on "|" and resolve its location pieces.

		Example of ownStr.split("|"):
			['14678786@N00', 'milwaukee, United States']

		The second "|"-separated field, when present, is split on "," and
		each piece is passed through clean_line().

		Returns a 3-tuple:
			- the text before the first "|",
			- the list of cleaned location pieces (empty without a "|"),
			- the tag2code values of those pieces that are keys of tag2code.
		'''
		hometown = []
		hometownTags = []
		# Split once and reuse the result for both the id and the location.
		blk = ownStr.split("|")
		if len(blk) > 1:
			for hw in blk[1].split(","):
				hw_clear = clean_line(hw)
				hometown.append( hw_clear )
				# `in` replaces dict.has_key(), which Python 3 removed.
				if hw_clear in tag2code:
					hometownTags.append( tag2code[hw_clear] )
		return blk[0], hometown, hometownTags
# 3: tags
# 4: timestamp
# 5: timestamp

# example of output line:
#4876969322	Boats on The Thames at Penton Hook Lock	http://www.flickr.com/photos/37413900@N04/4876969322/	thames pleasure boats penton hook lock gates laleham staines surrey barge	51.414745|-0.500274|16|Staines|Surrey|Angleterre|Royaume Uni|16	37413900@N04|Greater London, England United Kingdom
# 0: imageId
# 1: Title
# 2: url
# 3: tags
# 4: coordinates
# 5: userId|hometown

# Hard-coded denominator for the progress percentage below
# (presumably the line count of the input dump -- verify before reuse).
totLines = 3185258.0
lines = 0
print >> sys.stderr, "*> Convert Images MetaData",
for line in sys.stdin:
	lines += 1
	# Input fields are separated by " : ":
	#   0: "userId/imageId"   1: url   2: geo metadata   3: tags
	blk = line.strip().split(" : ")
	usrId, imgId = blk[0].split( "/" )
	# The input carries no title; the output column is left empty.
	title = ""
	url = blk[1]
	lat, lon, acc = getGeoDataFromImageMeta( blk[2] )
	coord = lat +"|"+ lon +"|"+ acc 
	tags = clean_line( blk[3] )
	# Output is tab-separated: imageId, title, url, tags, coordinates, userId.
	print >> sys.stdout, "%s\t%s\t%s\t%s\t%s\t%s" % (imgId.encode('utf-8'), title, url.encode('utf-8'), tags.encode('utf-8'), coord.encode('utf-8'), usrId.encode('utf-8'))
	# Report progress once every 200000 lines. The previous test
	# (`if lines % 200000:`) was inverted and wrote to stderr on every
	# line *except* the multiples of 200000.
	if lines % 200000 == 0:
		print >> sys.stderr, "\r*> Converting Images MetaData [%2.2f%s]" % ( float(lines)/totLines*100, '%'),
print >> sys.stderr, "\r*> Converting Images MetaData: %d lines" % lines

#-----------------------------------------

# Loading Stopword
# open() is the documented way to open files; file() does not exist in Python 3.
stopwordFile = open( stopwordsFilePath, 'r' )
stopworddiz = loadStopword( stopwordFile )

print >> sys.stderr, "*> [InputSet] Parsing tags",
lines = 0
#totLines = 10152.0
totLines = 3195410.0
for line in sys.stdin:
	lines += 1
	# blk = ['2408552791', 'La electricidad est\xc3\xa1tica', 'http://flickr.com/photos/39556080@N00/2408552791',
	# 'museo pelos pelo carina van de graaff', '-32.959093|-60.623738|Argentina|Santa Fe|Rosario', '39556080@N00|Rosario, Argentina']
	blk = clean_line( line ).split("\t")
	# Extract tags and mtags
	tags, mtags = tagsfilters( blk[3] )
	tags = tags.strip()
	mtags = mtags.strip()

	# Drop tags shorter than two characters and stopwords.
	# The previous loop iterated over the *characters* of the tag string,
	# so every item had length 1 and was skipped: nothing was filtered.
	# It also appended to the very string it was iterating. Iterate over
	# whitespace-separated tags and collect the survivors separately.
	keptTags = []
	for tag in tags.split():
		if len(tag) < 2:
			continue
		# check if it is a stopword (`in` replaces the removed dict.has_key())
		if tag in stopworddiz:
			continue
		keptTags.append( tag )
	tags = " ".join( keptTags )
	# clean tags
Example #7
0
#-----------------------------------------

# Loading Stopword
# open() is the documented way to open files; file() does not exist in Python 3.
stopwordFile = open(stopwordsFilePath, 'r')
stopworddiz = loadStopword(stopwordFile)

print >> sys.stderr, "*> [InputSet] Parsing tags",
lines = 0
#totLines = 10152.0
totLines = 3195410.0
for line in sys.stdin:
    lines += 1
    # blk = ['2408552791', 'La electricidad est\xc3\xa1tica', 'http://flickr.com/photos/39556080@N00/2408552791',
    # 'museo pelos pelo carina van de graaff', '-32.959093|-60.623738|Argentina|Santa Fe|Rosario', '39556080@N00|Rosario, Argentina']
    blk = clean_line(line).split("\t")
    # Extract tags and mtags
    tags, mtags = tagsfilters(blk[3])
    tags = tags.strip()
    mtags = mtags.strip()

    # Drop tags shorter than two characters and stopwords.
    # The previous loop iterated over the *characters* of the tag string,
    # so every item had length 1 and was skipped: nothing was filtered.
    # It also appended to the very string it was iterating. Iterate over
    # whitespace-separated tags and collect the survivors separately.
    keptTags = []
    for tag in tags.split():
        if len(tag) < 2:
            continue
        # check if it is a stopword (`in` replaces the removed dict.has_key())
        if tag in stopworddiz:
            continue
        keptTags.append(tag)
    tags = " ".join(keptTags)
    # clean tags