# Imports assumed by the excerpts below; 'cf' is the project's shared helper
# module (its real module name is not shown in these excerpts, so the import
# below is a placeholder), and helpers such as make_twitter_request,
# validateTweet, geo_search, validate_area, write_place_info,
# getFeatureVecsAndLabel, getFeatureVecAndLabel, showResult, writeResult,
# writeToFile and check_lang_file are defined elsewhere in the repo.
import os
import sys
import json
import time
from datetime import datetime

import twitter
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

import common_functions as cf  # placeholder name for the shared helper module


def main():
    if len(sys.argv) < 4:  # city name, language and target date are all required
        print 'please input city name, language and target date to specify the training data file'
        quit()
    city_name = sys.argv[1]
    lang = sys.argv[2]
    target_date = sys.argv[3]
    test_data_path = '/home/muga/twitter/test_data/retrieved_data/' + city_name + '/'
    cf.validate_directory(test_data_path)
    training_data_path = '/home/muga/twitter/original_trainingdata/'
    train_data = ''
    test_data = ''
    # tweet data obtained from the search API
    for f in os.listdir(test_data_path):
        if f.endswith('.csv') and target_date in f and lang in f:
            test_data = test_data_path + f
            break
    for f in os.listdir(training_data_path):
        if f.endswith('.csv') and 'merge' in f and lang in f:
            train_data = training_data_path + f
            break
        if lang == 'es' and 'training_data' in f and lang in f:
            train_data = training_data_path + f
            break
    print 'train data: ' + train_data
    print 'test data: ' + test_data
    confirm = raw_input('it is going to process these files. is it okay? (yes/no) ')
    if confirm.lower() not in ('y', 'yes'):  # substring test against 'yes' let 'e'/'s' pass
        print 'aborting this program'
        quit()
    classification(train_data, test_data)
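# Hypothetical invocation of the main() above (the script name and argument
# values are assumptions; the argv order comes from the parsing in main):
#
#   python classify_city_tweets.py London en 01Jan2017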
def getCoordinates(city_name, keyword):
    data_path = '/home/muga/twitter/place_id_data/' + city_name + '/'
    cf.validate_directory(data_path)
    file_name = ''
    for f in os.listdir(data_path):
        if (city_name in f and f.endswith('txt')
                and f.startswith('coordinate') and keyword in f):
            file_name = data_path + f
            break
    if len(file_name) == 0:
        print 'The input file is not found. Abort.'
        quit()
    input_file = open(file_name, 'r')  # avoid shadowing the built-in 'input'
    s = input_file.readline()
    coordinate_dict = {}
    while s:  # until the end of the file
        splitted_line = s.split(',')
        if not len(splitted_line) == 3:
            # skip malformed lines instead of parsing stale fields
            s = input_file.readline()
            continue
        place_name = splitted_line[0]
        latitude = splitted_line[1]
        longitude = splitted_line[2]
        coordinate_dict[place_name] = [latitude, longitude]
        s = input_file.readline()
    input_file.close()
    return coordinate_dict
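# Sketch of the input format getCoordinates expects: one place per line as a
# "name,latitude,longitude" triple, in a file named with a 'coordinate'
# prefix, the city name and the keyword (the exact file name and the sample
# values below are assumptions, not taken from the repo):
#
#   Camden,51.5390,-0.1426
#   Croydon,51.3714,-0.0977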
def obtainTweetsFromStream(twitter_api, q, lang, emotion, max_results):
    kw = {}
    kw['track'] = q
    kw['language'] = lang
    twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
    tweets = make_twitter_request(twitter_stream.statuses.filter, **kw)
    date = time.strftime("%d%b%Y%H%M")
    file_name = date + "_" + lang + "_" + emotion + "_from_stream.txt"
    # this text file should be moved to another directory
    out_file_path = "/home/muga/twitter/tweets_from_stream/training/"
    cf.validate_directory(out_file_path, True)
    output = open(out_file_path + file_name, 'w')
    start_time = datetime.now()
    count = 0
    while count < max_results:
        for tweet in tweets:
            if 'text' not in tweet:
                break  # leave this for loop and re-request the stream
            txt = tweet['text']
            if validateTweet(txt, emotion):
                s = json.dumps(tweet['text'], indent=1)
                output.write(s + ', ' + tweet['created_at'].encode('utf-8') + '\n')
                count += 1
                if count % 100 == 0:
                    print txt + ' : ' + str(count) + ' out of ' + str(max_results)
                if count >= max_results:
                    break
        if count < max_results:
            tweets = make_twitter_request(twitter_stream.statuses.filter, **kw)
    print 'Extracting ' + emotion + ' tweets of ' + lang + ' has finished.'
    # write the elapsed time before closing the output file
    cf.write_exec_time(start_time, output)
    output.close()
    return
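# Minimal usage sketch for obtainTweetsFromStream (the auth file name, track
# keyword and result count below are hypothetical):
#
#   twitter_api = cf.authentication_twitter('auth.txt')
#   obtainTweetsFromStream(twitter_api, ':)', 'en', 'pos', 1000)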
def main():
    twitter_api = cf.authentication_twitter(sys.argv[-1])
    argvs = sys.argv
    searched_country = ''
    cityname = ''
    non_city_name = False
    if len(argvs) > 1:
        cityname = argvs[1]
        if len(argvs) > 2 and argvs[2].isalpha() == False:
            wait_time = int(argvs[2])
            print 'Please wait for ' + argvs[2] + ' minutes'
            time.sleep(60 * wait_time + 5)
        # argvs[2] has to equal 't' to make non_city_name true
        if len(argvs) > 2 and 't' == argvs[2]:
            non_city_name = True
            print 'non_city_name is True'
    accuracy = 10000
    if cityname:
        result = twitter_api.geo.search(query=cityname)
    else:
        print 'Please input a city name. Abort.'
        quit()
    places = result['result']['places']
    place_id_dict = {}
    for place in places:
        place_full_name = place['full_name']
        place_name = place['name']
        place_id = place['id']
        country = place['country']
        if searched_country and validate_area(country, searched_country) == False:
            print 'fullname: ' + place_full_name + ', country: ' + country
            continue
        place_id_dict[place_id] = [place_full_name, place_name]
    if non_city_name:
        out_file_path = "/home/muga/twitter/place_id_data/others/"
    else:
        cityname = cityname.replace(' ', '')
        out_file_path = "/home/muga/twitter/place_id_data/" + cityname + "/"
    file_name = "placeid_" + cityname + "_keyword_search.txt"
    file_name = file_name.replace(' ', '')
    cf.validate_directory(out_file_path)
    output = open(out_file_path + file_name, 'w')
    output.write('keyword: ' + cityname + '\n')
    for p_id in place_id_dict:
        s = p_id + ": " + place_id_dict[p_id][0] + ": " + place_id_dict[p_id][1] + "\n"
        output.write(s.encode('utf-8'))  # encode s as a UTF-8 byte string
    output.close()
def main():
    if len(sys.argv) < 3:
        print 'the input must have an emotion (pos, neg or neu) and the target date of the text file'
        quit()
    emotion = sys.argv[1]  # pos, neg or neu
    target_date = sys.argv[2]
    if emotion not in ['pos', 'neg', 'neu']:
        print emotion + ' is a wrong input'
        quit()
    train_data_path = '/home/muga/twitter/tweets_from_stream/training/'
    train_data_files = []
    for f in os.listdir(train_data_path):
        if not target_date == '' and target_date in f and emotion in f:
            train_data_files.append(f)
            print 'append ' + f
        elif target_date == '' and f.endswith('.txt') and emotion in f:
            train_data_files.append(f)
            print 'append ' + f
    if len(train_data_files) == 0:
        print 'Not Found'
        quit()
    confirm = raw_input('it is going to process these files. is it okay? (yes/no) ')
    if confirm.lower() not in ('y', 'yes'):
        print 'cancel'
        quit()
    out_path = '/home/muga/twitter/new_trainingdata/'
    #out_path = '/home/muga/twitter/new_trainingdata/debug/'
    cf.validate_directory(out_path, True)
    for f in train_data_files:
        input_file = open(train_data_path + f)
        lines_of_tweet = input_file.readlines()
        input_file.close()
        # validate each sentence again: in neutral tweets (news accounts)
        # every tweet carries a URL, so the URLs should be removed.
        # put the affect value as pos -> 0, neg -> 1, neu -> 2 (?), then write.
        lang = cf.find_lang(f)
        output_file = open(
            out_path + f.split('_')[0] + '_' + lang + '_' + emotion + '_train_data.csv',
            'wb')
        writeToFile(lines_of_tweet, output_file, emotion)
        output_file.close()
    return
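# Hypothetical invocation of the main() above (script name assumed):
#
#   python build_training_data.py pos 01Jan2017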
def main():
    twitter_api = cf.authentication_twitter(sys.argv[-1])
    if len(sys.argv) >= 2:
        cityname = sys.argv[1]
        if len(sys.argv) > 2 and sys.argv[2].isalpha() == False:
            wait_time = int(sys.argv[2])
            print 'Please wait for ' + sys.argv[2] + ' minutes'
            time.sleep(60 * wait_time + 5)
    else:
        cityname = raw_input('query (London, NewYork etc) = ')
    #print "granularity (poi, neighborhood, city, admin, country) = ",
    if len(sys.argv) < 2 and cityname == 'London':
        print 'Please input a keyword, e.g. centre, outer'
        quit()
    if cityname == 'London':
        keyword = sys.argv[1]
    else:
        keyword = ''
    coordinate_dict = getCoordinates(cityname, keyword)
    count_invoke = 0
    out_file_path = "/home/muga/twitter/place_id_data/" + cityname + '/'
    cf.validate_directory(out_file_path)
    if keyword:
        file_name = "placeid_" + keyword + "_" + cityname + "_coordinate_search.txt"
    else:
        file_name = "placeid_" + cityname + "_coordinate_search.txt"
    output = open(out_file_path + file_name, 'w')
    for place_name in coordinate_dict.keys():
        coordinate = coordinate_dict[place_name]
        latitude = float(coordinate[0])
        longitude = float(coordinate[1])
        if count_invoke == 13:
            # back off before exhausting the geo search rate-limit window
            print 'Sleep for 15 minutes...zz....'
            time.sleep(60 * 15 + 5)
            print '..zz...Awake !! Restart'
            count_invoke = 0
        places_info = geo_search(twitter_api, place_name, latitude, longitude)
        count_invoke += 1
        output.write('place name: ' + place_name + ', latitude: ' + str(latitude) +
                     ', longitude: ' + str(longitude) + "\n")
        write_place_info(places_info, place_name, output)
    output.close()
def classification(filename_train, filename_test):
    label_train, feat_vec_train, feat_vec_test = getFeatureVecsAndLabel(
        filename_train, filename_test)
    print 'data extraction has finished'
    scores = ['accuracy', 'precision', 'recall']
    out_file_name = filename_test.split('/')[-1] + '.txt'
    city_name = sys.argv[1]
    out_file_path = "/home/muga/twitter/classification_result/random_forest/" + city_name + '/'
    cf.validate_directory(out_file_path)
    out = open(out_file_path + out_file_name, 'a')
    for score in scores:
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)
        tuned_parameters = [{
            'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
            'max_features': ['auto', 'sqrt', 'log2', None]
        }]
        clf = GridSearchCV(RandomForestClassifier(),
                           param_grid=tuned_parameters,
                           cv=3,
                           scoring=score,
                           n_jobs=-1)
        clf.fit(feat_vec_train, label_train)
        print clf.best_estimator_
        y_pred = clf.predict(feat_vec_test)
        showResult(score, y_pred, out)
        print 'loop for ' + score + ' has finished\n'
    out.close()
    print "classification of " + filename_train + " has finished"
    return
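# Self-contained sketch of the grid-search pattern used above, on synthetic
# data (everything here is toy data; only the GridSearchCV wiring mirrors
# the function above):
#
#   import numpy as np
#   from sklearn.ensemble import RandomForestClassifier
#   from sklearn.model_selection import GridSearchCV
#
#   X = np.random.rand(60, 5)            # 60 toy samples with 5 features
#   y = np.random.randint(0, 3, 60)      # 3 classes, like pos/neg/neu
#   params = {'n_estimators': [10, 50], 'max_features': ['sqrt', None]}
#   clf = GridSearchCV(RandomForestClassifier(), param_grid=params,
#                      cv=3, scoring='accuracy')
#   clf.fit(X, y)                        # exhaustive search over params
#   print clf.best_estimator_            # the model refit on the best params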
def main():
    if len(sys.argv) < 2:
        print 'please input a city. If necessary, add the classification strategy, language and target date to specify the training data file'
        quit()
    if len(sys.argv) >= 3 and sys.argv[-1] == 'all':
        strategy = 'all'
    else:
        clf_strategy = raw_input(
            'One against One (0), One against the Rest (1) or Random Forest (2) ----> ')
        if clf_strategy == str(0):
            strategy = 'one_against_one'
        elif clf_strategy == str(1):
            strategy = 'one_against_the_rest'
        elif clf_strategy == str(2):
            strategy = 'random_forest'
        else:
            print 'wrong input ' + clf_strategy
            quit()
    strategies = ['one_against_one', 'one_against_the_rest', 'random_forest']
    city_name = sys.argv[1]
    if len(sys.argv) == 3 or (len(sys.argv) == 4 and sys.argv[3] == 'all'):
        if sys.argv[2] in ['de', 'en', 'es', 'fr', 'pr']:
            lang = sys.argv[2]
            target_date = ''
        else:
            target_date = sys.argv[2]
            lang = ''
    elif len(sys.argv) >= 4:
        lang = sys.argv[2]
        target_date = sys.argv[3]
        print 'target_date: ' + target_date
    else:
        lang = ''
        target_date = ''
    test_data_path = '/home/muga/twitter/test_data/retrieved_data/' + city_name + '/'
    cf.validate_directory(test_data_path)
    training_data_path = '/home/muga/twitter/new_trainingdata/'
    train_data_dict = {}
    test_data_list = []
    # tweet data obtained from the search API
    for f in os.listdir(test_data_path):
        if 'uniq' not in f:  # a test file should not contain duplicate lines
            continue
        if f.endswith('.csv') and target_date in f and lang in f:
            if lang and check_lang_file(f, lang):
                test_data_list.append(test_data_path + f)
            elif not lang:
                test_data_list.append(test_data_path + f)
    for f in os.listdir(training_data_path):
        if f.endswith('.csv') and 'new' in f:
            language = cf.find_lang(f)
            print 'language', language
            train_data_dict[language] = training_data_path + f
    if len(test_data_list) == 0:
        print 'Not Found'
        quit()
    print 'list of test data: '
    print '\n'.join(test_data_list)
    print 'list of training data: '
    for k in train_data_dict:
        print k, train_data_dict[k]
    if strategy == 'all':
        confirm = raw_input("it's going to do all classifications. Is it okay? (yes/no) ")
    else:
        confirm = raw_input(
            'It is going to process these files with %s. Is it okay? (yes/no) ' % strategy)
    if confirm.lower() not in ('y', 'yes'):
        print 'cancel'
        quit()
    for test_data in test_data_list:
        if lang:
            language = lang
        else:
            language = cf.find_lang(test_data.split('/')[-1])
        train_data = train_data_dict[language]
        print 'train data: ' + train_data
        print 'test on ' + test_data
        if strategy == 'all':
            for s in strategies:
                classification(train_data, test_data, s)
        else:
            classification(train_data, test_data, strategy)
def classification(filename_train, filename_test, strategy):
    label_train, feat_vec_train, feat_vec_test, tweets = getFeatureVecsAndLabel(
        filename_train, filename_test)
    print 'data extraction has finished'
    scores = ['accuracy', 'precision_micro', 'recall_weighted', 'f1_micro']
    date = time.strftime("%d%b%Y%H%M")
    out_file_name = filename_test.split('/')[-1] + '_' + date + '.txt'
    out_file_name = out_file_name.replace('.csv', '')
    city_name = sys.argv[1]
    out_file_path = ("/home/muga/twitter/classification_result/training_data_emoji/"
                     + city_name + "/" + strategy + '/')
    cf.validate_directory(out_file_path, True)
    lang = cf.find_lang(filename_test.split('/')[-1])
    another_out_file_path = out_file_path + lang + '/'
    cf.validate_directory(another_out_file_path, True)
    out = open(out_file_path + out_file_name, 'a')
    start_time = datetime.now()
    print '-' * 10 + strategy + ' ' + filename_test + '-' * 10
    for score in scores:
        if cf.skip_parameter(score, strategy, lang):
            continue
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)
        if strategy == 'one_against_the_rest':
            tuned_parameters = {
                'C': [1, 10, 100, 1000],
                'tol': [1e-3, 1e-4],
                'multi_class': ['ovr', 'crammer_singer']
            }
            clf = GridSearchCV(LinearSVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'one_against_one':
            tuned_parameters = [{
                'kernel': ['rbf'],
                'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]
            }, {
                'kernel': ['linear'],
                'C': [1, 10, 100, 1000]
            }]
            clf = GridSearchCV(SVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'random_forest':
            tuned_parameters = [{
                'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                'max_features': ['auto', 'sqrt', 'log2', None]
            }]
            clf = GridSearchCV(RandomForestClassifier(),
                               param_grid=tuned_parameters,
                               cv=3,
                               scoring=score,
                               n_jobs=-1)
        else:
            print strategy + ' is a wrong strategy'
            quit()
        clf.fit(feat_vec_train, label_train)
        print clf.best_estimator_
        y_pred = clf.predict(feat_vec_test)
        showResult(score, y_pred, out)
        writeResult(score, y_pred, tweets, out_file_name, another_out_file_path)
        print 'loop for ' + score + ' has finished\n'
    cf.write_exec_time(start_time, out)
    out.close()
    print "classification of " + filename_test + " has finished"
    print '-' * 30
    return
def classification(filename, strategy):
    labels, feature_vec = getFeatureVecAndLabel(filename)
    data_train, data_test, label_train, label_test = train_test_split(
        feature_vec, labels, test_size=0.2)
    print 'data extraction has finished'
    scores = ['accuracy', 'precision_micro', 'recall_weighted', 'f1_micro']
    date = time.strftime('%d%b%Y%H%M')
    out_file_name = filename.split('/')[-1].split('.')[0] + '_' + date + '_' + strategy + '.txt'
    out_file_path = "/home/muga/twitter/classification_result/classifier_evaluation/training_data_emoji/"
    cf.validate_directory(out_file_path, True)
    out = open(out_file_path + out_file_name, 'a')
    start_time = datetime.now()
    for score in scores:
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)
        if strategy == 'one_against_the_rest':
            tuned_parameters = {
                'C': [1, 10, 100, 1000],
                'tol': [1e-3, 1e-4],
                'multi_class': ['ovr', 'crammer_singer']
            }
            clf = GridSearchCV(LinearSVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'one_against_one':
            tuned_parameters = [{
                'kernel': ['rbf'],
                'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]
            }, {
                'kernel': ['linear'],
                'C': [1, 10, 100, 1000]
            }]
            clf = GridSearchCV(SVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'random_forest':
            tuned_parameters = [{
                'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                'max_features': ['auto', 'sqrt', 'log2', None]
            }]
            clf = GridSearchCV(RandomForestClassifier(),
                               param_grid=tuned_parameters,
                               cv=3,
                               scoring=score,
                               n_jobs=-1)
        else:
            # guard against an unknown strategy so clf is never undefined
            print strategy + ' is a wrong strategy'
            quit()
        clf.fit(data_train, label_train)
        print clf.best_estimator_
        y_true, y_pred = label_test, clf.predict(data_test)
        out.write(classification_report(y_true, y_pred))
        print 'loop for ' + score + ' has finished\n'
    cf.write_exec_time(start_time, out)
    out.close()
    print "classification of " + filename + " with " + strategy + " has finished"
    return
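# Minimal driver sketch for the evaluation variant above (the CSV path is an
# assumption; the strategy names come from the branches in the function):
#
#   for s in ['one_against_one', 'one_against_the_rest', 'random_forest']:
#       classification('/home/muga/twitter/new_trainingdata/sample_en_pos_train_data.csv', s)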
# Fragment: the tail of a main() like the keyword-search one above, from
# inside its loop over the returned places.
            continue
        place_list = [place_full_name, place_name]
        place_id_dict[place_id] = place_list
    if latitude and longitude:
        out_file_path = "/home/muga/twitter/place_id_data/"
        area_range = accuracy / 1000  # km
        file_name = ("placeid_" + latitude + "_" + longitude + "_"
                     + str(area_range) + "km_" + ".txt")
    else:
        cityname = cityname.replace(' ', '')
        out_file_path = "/home/muga/twitter/place_id_data/" + cityname + "/"
        if granularity:
            file_name = "placeid_" + cityname + "_" + granularity + ".txt"
        else:
            file_name = "placeid_" + cityname + "_keyword_search.txt"
    file_name = file_name.replace(' ', '')
    cf.validate_directory(out_file_path)
    output = open(out_file_path + file_name, 'w')
    output.write('city name: ' + cityname + ', granularity: ' + granularity +
                 ', latitude: ' + latitude + ', longitude: ' + longitude + "\n")
    for p_id in place_id_dict:
        s = p_id + ": " + place_id_dict[p_id][0] + ": " + place_id_dict[p_id][1] + "\n"
        output.write(s.encode('utf-8'))  # encode s as a UTF-8 byte string
    output.close()


if __name__ == '__main__':
    main()