def main():
    """Learn sentiment scores for unlabelled words from a stream of tweets.

    Command-line arguments:
        sys.argv[1]: path to a file holding one JSON-encoded tweet per line.
        sys.argv[2]: path to a tab-separated ``term<TAB>integer score`` file
            with the manually labelled words.

    For every tweet the mean score of its already-known words is computed
    (manual scores take precedence over learned ones).  That mean is then
    blended into the learned score of every word of the tweet.  Finally the
    manual dictionary is printed, followed by the learned words that are not
    in the manual dictionary, one ``word<TAB>score`` pair per line.

    Raises:
        ValueError: if a dictionary line does not have exactly two
            tab-separated fields or its score is not an integer, or if a
            tweet line is not valid JSON.
    """
    # Construct the original scores' dictionary.
    scores = {}
    with open(sys.argv[2]) as dict_file:
        for line in dict_file:
            term, score = line.split("\t")
            scores[term] = int(score)

    new_words = {}

    # For every tweet do the following:
    # Step 1. Calculate scores according to the current state of the dictionary.
    # Step 2. Try to improve scores for words that were not manually labelled.
    with open(sys.argv[1]) as tweet_file:
        for line in tweet_file:
            jline = json.loads(line)
            try:
                text = jline["text"]
                words = [
                    strip_accents(word.encode('ascii', 'ignore'))
                    for word in text.strip().split(" ")
                ]
            except Exception:
                # Best effort: a record without usable text (for example one
                # lacking a "text" key) contributes nothing.
                words = []

            score = 0.0
            cont = 0.0
            for word in words:
                # Step 1. Manually scored words are favored.
                # Membership is tested on the dicts directly; ``.keys()``
                # would build and scan a list on every lookup.
                if word in scores:
                    score += scores[word]
                    cont += 1.0
                elif word in new_words:
                    score += new_words[word]
                    cont += 1.0

            if cont > 0:
                # Step 2. Simple update for new words.
                # Manually labelled words are updated here as well; they are
                # filtered out when the learned scores are printed.
                for word in words:
                    if word in new_words:
                        new_words[word] = .8 * new_words[word] + .2 * score / cont
                    else:
                        new_words[word] = score / cont

    # A single parenthesised argument prints identically to the Python 2
    # print statement.
    for key in scores:
        print(key + "\t" + str(scores[key]))
    for key in new_words:
        if key not in scores:
            print(key + "\t" + str(new_words[key]))
def main():
    """Learn sentiment scores for unlabelled words from a stream of tweets.

    Command-line arguments:
        sys.argv[1]: path to a file holding one JSON-encoded tweet per line.
        sys.argv[2]: path to a tab-separated ``term<TAB>integer score`` file
            with the manually labelled words.

    For every tweet the mean score of its already-known words is computed
    (manual scores take precedence over learned ones).  That mean is then
    blended into the learned score of every word of the tweet.  Finally the
    manual dictionary is printed, followed by the learned words that are not
    in the manual dictionary, one ``word<TAB>score`` pair per line.

    Raises:
        ValueError: if a dictionary line does not have exactly two
            tab-separated fields or its score is not an integer, or if a
            tweet line is not valid JSON.
    """
    # Construct the original scores' dictionary.
    scores = {}
    with open(sys.argv[2]) as dict_file:
        for line in dict_file:
            term, score = line.split("\t")
            scores[term] = int(score)

    new_words = {}

    # For every tweet do the following:
    # Step 1. Calculate scores according to the current state of the dictionary.
    # Step 2. Try to improve scores for words that were not manually labelled.
    with open(sys.argv[1]) as tweet_file:
        for line in tweet_file:
            jline = json.loads(line)
            try:
                text = jline["text"]
                words = [
                    strip_accents(word.encode('ascii', 'ignore'))
                    for word in text.strip().split(" ")
                ]
            except Exception:
                # Best effort: a record without usable text (for example one
                # lacking a "text" key) contributes nothing.
                words = []

            score = 0.0
            cont = 0.0
            for word in words:
                # Step 1. Manually scored words are favored.
                # Membership is tested on the dicts directly; ``.keys()``
                # would build and scan a list on every lookup.
                if word in scores:
                    score += scores[word]
                    cont += 1.0
                elif word in new_words:
                    score += new_words[word]
                    cont += 1.0

            if cont > 0:
                # Step 2. Simple update for new words.
                # Manually labelled words are updated here as well; they are
                # filtered out when the learned scores are printed.
                for word in words:
                    if word in new_words:
                        new_words[word] = .8 * new_words[word] + .2 * score / cont
                    else:
                        new_words[word] = score / cont

    # A single parenthesised argument prints identically to the Python 2
    # print statement.
    for key in scores:
        print(key + "\t" + str(scores[key]))
    for key in new_words:
        if key not in scores:
            print(key + "\t" + str(new_words[key]))
def main():
    """Print the sentiment score and coordinates of tweets placed in Mexico.

    Command-line arguments:
        sys.argv[1]: path to a file holding one JSON-encoded tweet per line.
        sys.argv[2]: path to a tab-separated ``term<TAB>score`` dictionary;
            lines that cannot be parsed are skipped.

    A tweet's score is the sum of the dictionary scores of its words.  For
    every tweet that contains at least one dictionary word and whose
    ``place.country_code`` is ``'MX'``, one line is printed holding
    ``"id" + user.id_str``, the second and then the first element of
    ``geo.coordinates``, and the score.  Tweets missing any of those fields
    are skipped silently.

    Raises:
        ValueError: if a tweet line is not valid JSON.
    """
    # Construct the original scores' dictionary.
    scores = {}
    with open(sys.argv[2]) as dict_file:
        for line in dict_file:
            try:
                term, score = line.split("\t")
                scores[term] = float(score)
            except ValueError:
                # Wrong number of fields or a non-numeric score: ignore line.
                continue

    # For every tweet do the following:
    # Step 1. Calculate scores according to the current dictionary.
    # Step 2. If the tweet was given a score, print the coordinates and the score.
    with open(sys.argv[1]) as tweet_file:
        for line in tweet_file:
            jline = json.loads(line)
            try:
                text = jline["text"]
                words = [
                    strip_accents(word.encode('ascii', 'ignore'))
                    for word in text.split(" ")
                ]
            except Exception:
                # Best effort: a record without usable text (for example one
                # lacking a "text" key) contributes nothing.
                words = []

            # Step 1. Dict membership is tested directly; ``.keys()`` would
            # build and scan a list on every lookup.
            matched = [scores[word] for word in words if word in scores]
            if not matched:
                continue
            score = sum(matched, 0.0)

            # Step 2. Location and score are printed.
            try:
                if jline["place"]["country_code"] != 'MX':
                    continue
                x, y = jline["geo"]["coordinates"]
                user_id = "id" + jline["user"]["id_str"]
            except (KeyError, TypeError, ValueError):
                # Missing or null place/geo/user data: nothing to report.
                continue

            # Joining with single spaces reproduces the output of the
            # original ``print a, b, ...`` statement byte for byte.
            print(" ".join([user_id, ", ", str(y), ", ", str(x), ",", str(score)]))
def main():
    """Build a Spanish sentiment dictionary from two line-aligned files.

    Command-line arguments:
        sys.argv[1]: path to the English file, ``term<TAB>integer score`` per
            line.
        sys.argv[2]: path to the Spanish file, read line by line in step with
            the English file.

    Each Spanish line is paired with the English line at the same position.
    Only Spanish lines made of exactly two space-separated fields are kept;
    the first field, passed through ``strip_accents``, is stored with the
    score of the paired English line.  The result is printed as
    ``word<TAB>score`` lines.

    Raises:
        StopIteration: if the English file has fewer lines than the Spanish
            file.
        ValueError: if an English line does not have exactly two
            tab-separated fields, or the score of a kept line is not an
            integer.
    """
    scores = {}
    with open(sys.argv[1]) as english_file, open(sys.argv[2]) as spanish_file:
        for line in spanish_file:
            # The English file is advanced on every Spanish line, kept or
            # not, so the two files stay aligned.
            _, score = next(english_file).split("\t")

            # For simplicity, avoid phrases and use only words.
            fields = line.strip().split(" ")
            if len(fields) == 2:
                scores[strip_accents(fields[0])] = int(score)

    # A single parenthesised argument prints identically to the Python 2
    # print statement.
    for key in scores:
        print(key + "\t" + str(scores[key]))
def main():
    """Build a Spanish sentiment dictionary from two line-aligned files.

    Command-line arguments:
        sys.argv[1]: path to the English file, ``term<TAB>integer score`` per
            line.
        sys.argv[2]: path to the Spanish file, read line by line in step with
            the English file.

    Each Spanish line is paired with the English line at the same position.
    Only Spanish lines made of exactly two space-separated fields are kept;
    the first field, passed through ``strip_accents``, is stored with the
    score of the paired English line.  The result is printed as
    ``word<TAB>score`` lines.

    Raises:
        StopIteration: if the English file has fewer lines than the Spanish
            file.
        ValueError: if an English line does not have exactly two
            tab-separated fields, or the score of a kept line is not an
            integer.
    """
    scores = {}
    with open(sys.argv[1]) as english_file, open(sys.argv[2]) as spanish_file:
        for line in spanish_file:
            # The English file is advanced on every Spanish line, kept or
            # not, so the two files stay aligned.
            _, score = next(english_file).split("\t")

            # For simplicity, avoid phrases and use only words.
            fields = line.strip().split(" ")
            if len(fields) == 2:
                scores[strip_accents(fields[0])] = int(score)

    # A single parenthesised argument prints identically to the Python 2
    # print statement.
    for key in scores:
        print(key + "\t" + str(scores[key]))