Code Example #1
import os
import sys

# NOTE: 'cf' is a project-local helper module (validate_directory, etc.)
# imported elsewhere in the original file.


def main():
    if len(sys.argv) < 4:  # city name, language and target date are all required
        print 'please input city name, language and target date to specify the training data file'
        quit()
    city_name = sys.argv[1]
    lang = sys.argv[2]
    target_date = sys.argv[3]
    test_data_path = '/home/muga/twitter/test_data/retrieved_data/' + city_name + '/'
    cf.validate_directory(test_data_path)
    training_data_path = '/home/muga/twitter/original_trainingdata/'
    train_data = ''
    test_data = ''  #tweet data obtained from search API
    for f in os.listdir(test_data_path):
        if f.endswith('.csv') and target_date in f and lang in f:
            test_data = test_data_path + f
            break
    for f in os.listdir(training_data_path):
        if f.endswith('.csv') and 'merge' in f and lang in f:
            train_data = training_data_path + f
            break
        if lang == 'es' and 'training_data' in f and lang in f:
            train_data = training_data_path + f
            break

    print 'train data: ' + train_data
    print 'test data: ' + test_data

    confirm = raw_input(
        'it is going to process these files. is it okay ? (yes/no) ')
    if confirm.lower() != 'yes':
        print 'abort this program'
        quit()

    classification(train_data, test_data)
Code Example #2
import os

# NOTE: 'cf' is a project-local helper module (validate_directory, etc.).


def getCoordinates(city_name, keyword):
	data_path = '/home/muga/twitter/place_id_data/' + city_name + '/'
	cf.validate_directory(data_path)
	file_name = ''
	#print 'city name: ' + city_name
	for f in os.listdir(data_path):
		if city_name in f and f.endswith('txt') and f.startswith('coordinate') and keyword in f:
			file_name = data_path + f
			break
	if len(file_name) == 0:
		print 'The Input File is not Found. Abort'
		quit()

	input_file = open(file_name, 'r')  # renamed so the builtin 'input' is not shadowed
	s = input_file.readline()

	coodinate_dict = {}
	while s:  #until the end of the file
		#print s  #for debug
		splitted_line = s.strip().split(',')  # strip the trailing newline before splitting
		if len(splitted_line) != 3:  # skip malformed lines
			s = input_file.readline()
			continue
		place_name = splitted_line[0]
		latitude = splitted_line[1]
		longtitude = splitted_line[2]
		coordinate = [latitude, longtitude]
		coodinate_dict[place_name] = coordinate
		s = input_file.readline()
	return coodinate_dict
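Going by the parsing loop above, the input file is expected to hold one place_name,latitude,longitude triple per comma-separated line. A minimal usage sketch follows; the 'London' / 'centre' arguments are placeholders, and Code Example #6 shows the real call site:

# Hypothetical usage sketch for getCoordinates; the arguments are placeholders.
coords = getCoordinates('London', 'centre')
for place_name, (lat, lon) in coords.items():
    print place_name + ': ' + lat + ', ' + lon  # values stay as strings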
Code Example #3
import json
import time
from datetime import datetime

import twitter  # the 'twitter' package that provides TwitterStream

# NOTE: 'cf', 'make_twitter_request' and 'validateTweet' are project-local
# helpers defined elsewhere in the original file.


def obtainTweetsFromStream(twitter_api, q, lang, emotion, max_results):
    kw = {}
    kw['track'] = q
    kw['language'] = lang
    twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
    tweets = make_twitter_request(twitter_stream.statuses.filter, **kw)
    #max_results = 200#can be modified

    date = time.strftime("%d%b%Y%H%M")
    file_name = date + "_" + lang + "_" + emotion + "_from_stream.txt"  #this text file should be moved to another directory
    out_file_path = "/home/muga/twitter/tweets_from_stream/training/"
    cf.validate_directory(out_file_path, True)
    output = open(out_file_path + file_name, 'w')

    start_time = datetime.now()
    count = 0
    for tweet in tweets:
        if 'text' in tweet:
            txt = tweet['text']
        else:
            break  # leave the for loop on a non-tweet message
        if validateTweet(txt, emotion):
            s = json.dumps(tweet['text'], indent=1)
            #print s + ', ' + tweet['created_at'].encode('utf-8')
            output.write(s + ', ' + tweet['created_at'].encode('utf-8') + '\n')
            count += 1
            if count % 100 == 0:
                print txt + ' : ' + str(count) + ' out of ' + str(max_results)
        if count >= max_results:  # stop once the target count is reached
            cf.write_exec_time(start_time, output)
            output.close()
            return

    print 'entering the while loop'
    #while len(tweets) > 0 and count < max_results:
    while count < max_results:
        tweets = make_twitter_request(twitter_stream.statuses.filter, **kw)
        for tweet in tweets:
            if count >= max_results:
                break  # stop as soon as the target count is reached
            if 'text' in tweet:
                txt = tweet['text']
            else:
                break  # leave the for loop on a non-tweet message
            if validateTweet(txt, emotion):
                s = json.dumps(tweet['text'], indent=1)
                output.write(s + ', ' + tweet['created_at'].encode('utf-8') +
                             '\n')
                count += 1
                if count % 100 == 0:
                    print txt + ' : ' + str(count) + ' out of ' + str(
                        max_results)
    print 'leaving the while loop'
    print 'Extracting ' + emotion + ' tweets of ' + lang + ' is done.'

    cf.write_exec_time(start_time, output)  # write timing before the file is closed
    output.close()
    return
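validateTweet is not part of this listing. A minimal stand-in consistent with how it is called above (tweet text plus an emotion label, returning a bool) might look like the following; the keyword lists are placeholders, not the original filtering rules:

# Hypothetical stand-in for the missing validateTweet helper; the real
# filtering rules are not shown in this listing.
EMOTION_KEYWORDS = {
    'pos': [':)', 'happy'],
    'neg': [':(', 'sad'],
    'neu': ['news'],
}

def validateTweet(text, emotion):
    # accept the tweet if it contains any placeholder keyword for its emotion
    return any(kw in text.lower() for kw in EMOTION_KEYWORDS.get(emotion, []))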
Code Example #4
import sys
import time

# NOTE: 'cf' and 'validate_area' are project-local helpers defined elsewhere
# in the original file.


def main():
	twitter_api = cf.authentication_twitter(sys.argv[-1])
	argvs = sys.argv
	searched_country = ''
	cityname = ''
	non_city_name = False  # initialised up front so the later checks cannot hit a NameError
	if len(argvs) > 1:
		cityname = argvs[1]
		if len(argvs) > 2 and not argvs[2].isalpha():
			wait_time = int(argvs[2])
			print 'Please wait for ' + argvs[2] + ' minutes'
			time.sleep(60 * wait_time + 5)

		if len(argvs) > 2 and argvs[2] == 't':  #argvs[2] has to be 't' to make 'non_city_name' true
			non_city_name = True
			print 'non_city_name is True'
	accuracy = 10000
	if cityname:
		result = twitter_api.geo.search(query=cityname)
	else:
		print 'no city name was given'
		quit()

	places = result['result']['places']
	place_id_dict = {}
	for place in places:
		place_full_name = place['full_name']
		place_name = place['name']
		place_id = place['id']
		country = place['country']
		if searched_country and validate_area(country, searched_country) == False:
			print 'fullname: ' + place_full_name + ', country: ' + country
			continue
		place_list = [place_full_name, place_name]
		place_id_dict[place_id] = place_list

	if non_city_name:
		out_file_path = "/home/muga/twitter/place_id_data/others/"
	else:
		cityname = cityname.replace(' ','')
		out_file_path = "/home/muga/twitter/place_id_data/" + cityname + "/"
	file_name = "placeid_" + cityname + "_keyword_search.txt"
	file_name = file_name.replace(' ', '')
	cf.validate_directory(out_file_path)
	output = open(out_file_path + file_name, 'w')

	output.write('keyword: ' + cityname + '\n')
	for p_id in place_id_dict:
		#print p + " ----> id:  " + place_dict[p] + "\n"
		s = p_id + ": " + place_id_dict[p_id][0] + ": " + place_id_dict[p_id][1]+ "\n"
		#encode s as UTF-8 before writing
		output.write(s.encode('utf-8'))
	output.close()
Code Example #5
import os
import sys

# NOTE: 'cf' and 'writeToFile' are project-local helpers defined elsewhere
# in the original file.


def main():
    if len(sys.argv) < 3:
        print 'the input must have emotion (pos, neg or neu) and target date of the text file'
        quit()

    emotion = sys.argv[1]  #pos, neg, neu
    target_date = sys.argv[2]
    if emotion not in ['pos', 'neg', 'neu']:
        print emotion + ' is wrong for input'
        quit()

    train_datas_path = '/home/muga/twitter/tweets_from_stream/training/'
    train_data_files = []
    for f in os.listdir(train_datas_path):
        if target_date != '' and target_date in f and emotion in f:
            train_data_files.append(f)
            print 'append ' + f
        elif target_date == '' and f.endswith('.txt') and emotion in f:
            train_data_files.append(f)
            print 'append ' + f

    if len(train_data_files) == 0:
        print 'Not Found'
        quit()

    confirm = raw_input(
        'it is going to process these files. is it okay ? (yes/no) ')
    if confirm.lower() != 'yes':
        print 'cancel'
        quit()

    out_path = '/home/muga/twitter/new_trainingdata/'
    #out_path = '/home/muga/twitter/new_trainingdata/debug/'
    cf.validate_directory(out_path, True)

    for f in train_data_files:
        input_file = open(train_datas_path + f)
        lines_of_tweet = input_file.readlines()
        #validate each sentence again.
        #in neutral tweets (news accounts) each tweet has URL therefore the URLs should be removed.
        #put affected value pos -> 0, neg -> 1, neu -> 2 ?
        #write
        lang = cf.find_lang(f)
        output_file = open(
            out_path + f.split('_')[0] + '_' + lang + '_' + emotion +
            '_train_data.csv', 'wb')
        writeToFile(lines_of_tweet, output_file, emotion)
        output_file.close()
    return
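writeToFile is also not shown here. Going by the comments above (strip URLs, map pos -> 0, neg -> 1, neu -> 2), a plausible stand-in is sketched below; the URL regex and the CSV layout are assumptions:

import re

# Hypothetical stand-in for the missing writeToFile helper; the label mapping
# follows the comment above, the cleaning and CSV layout are assumptions.
LABEL_OF = {'pos': 0, 'neg': 1, 'neu': 2}

def writeToFile(lines_of_tweet, output_file, emotion):
    for line in lines_of_tweet:
        text = re.sub(r'https?://\S+', '', line).strip()  # drop URLs
        if text:
            output_file.write(str(LABEL_OF[emotion]) + ',' + text + '\n')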
Code Example #6
import sys
import time

# NOTE: 'cf', 'geo_search' and 'write_place_info' are project-local helpers;
# getCoordinates is the function shown in Code Example #2.


def main():
	twitter_api = cf.authentication_twitter(sys.argv[-1])
	if len(sys.argv) >= 2:
		cityname = sys.argv[1]
		if len(sys.argv) > 2 and not sys.argv[2].isalpha():
			wait_time = int(sys.argv[2])
			print 'Please wait for ' + sys.argv[2] + ' minutes'
			time.sleep(60 * wait_time + 5)

	else:
		cityname = raw_input('query (London, NewYork etc)=')
	#print "granularity (poi, neighborhood, city, admin, country) = ",
	if len(sys.argv) < 2 and cityname == 'London':
		print 'Please Input Keyword. e.g. centre, outer'
		quit()
	if cityname == 'London':
		keyword = sys.argv[2]  # the keyword (e.g. centre, outer) is expected after the city name
	else:
		keyword = ''

	coodinate_dict = getCoordinates(cityname, keyword)
	count_invoke = 0
	out_file_path = "/home/muga/twitter/place_id_data/" + cityname + '/'
	cf.validate_directory(out_file_path)
	if keyword:
		file_name = "placeid_" + keyword + "_" + cityname + "_coordinate_search.txt"
	else:
		file_name = "placeid_" + cityname + "_coordinate_search.txt"
	output = open(out_file_path + file_name, 'w')

	for place_name, coordinate in coodinate_dict.items():
		latitude = float(coordinate[0])
		longtitude = float(coordinate[1])
		if count_invoke == 13:
			print 'Sleep for 15 minutes...zz....' 
			time.sleep(60*15 + 5)
			print '..zz...Awake !! Restart'
			count_invoke = 0
		places_info = geo_search(twitter_api, place_name, latitude, longtitude)
		count_invoke += 1
		output.write('place name: ' + place_name + ', latitude: ' + str(latitude) + ', longtitude: ' + str(longtitude) + "\n")
		write_place_info(places_info, place_name, output) 

	output.close()
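geo_search and write_place_info are defined elsewhere in the original file. Modeled on the twitter_api.geo.search(query=...) call in Code Example #4, geo_search plausibly wraps the same endpoint with coordinates; the parameter choices below are assumptions, not the original helper:

# Hypothetical sketch of the missing geo_search helper, modeled on the
# geo.search call in Code Example #4; granularity is an assumed choice.
def geo_search(twitter_api, place_name, latitude, longtitude):
    result = twitter_api.geo.search(lat=latitude, long=longtitude,
                                    query=place_name, granularity='poi')
    return result['result']['places']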
Code Example #7
import sys

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older versions

# NOTE: 'cf', 'getFeatureVecsAndLabel' and 'showResult' are project-local
# helpers defined elsewhere in the original file.


def classification(filename_train, filename_test):
    label_train, feat_vec_train, feat_vec_test = getFeatureVecsAndLabel(
        filename_train, filename_test)
    print 'data extraction is done'
    scores = ['accuracy', 'precision', 'recall']

    out_file_name = filename_test.split('/')[-1] + '.txt'
    city_name = sys.argv[1]
    out_file_path = "/home/muga/twitter/classification_result/random_forest/" + city_name + '/'
    cf.validate_directory(out_file_path)

    out = open(out_file_path + out_file_name, 'a')

    for score in scores:
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)
        tuned_parameters = [{
            'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
            'max_features': ['auto', 'sqrt', 'log2', None]
        }]
        clf = GridSearchCV(RandomForestClassifier(),
                           param_grid=tuned_parameters,
                           cv=3,
                           scoring=score,
                           n_jobs=-1)
        clf.fit(feat_vec_train, label_train)
        print clf.best_estimator_

        y_pred = clf.predict(feat_vec_test)
        showResult(score, y_pred, out)
        print 'loop for ' + score + ' is done\n'

    out.close()
    print "classification of " + filename_train + " has done"
    return
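GridSearchCV also exposes best_params_ and best_score_ after fitting; if the winning hyper-parameters should go into the result file as well, a small addition inside the score loop, right after clf.fit(...), would do it (a sketch, not part of the original):

        # Optional: also record the winning hyper-parameters and their CV score.
        out.write('\nbest params: ' + str(clf.best_params_))
        out.write('\nbest CV ' + score + ': ' + str(clf.best_score_))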
Code Example #8
import os
import sys

# NOTE: 'cf', 'check_lang_file' and 'classification' (Code Example #9) are
# defined elsewhere in the original file.


def main():
    if len(sys.argv) < 2:
        print 'please input city. And if necessary enter the way of classification, language and target date to specify the training data file'
        quit()

    if len(sys.argv) >= 3 and sys.argv[-1] == 'all':
        strategy = 'all'
    else:
        clf_strategy = raw_input(
            'One against One (0), One against The Rest (1) or Random Forest (2)  ----> '
        )
        if clf_strategy == '0':
            strategy = 'one_against_one'
        elif clf_strategy == '1':
            strategy = 'one_against_the_rest'
        elif clf_strategy == '2':
            strategy = 'random_forest'
        else:
            print 'wrong input ' + clf_strategy
            quit()
    strategies = ['one_against_one', 'one_against_the_rest', 'random_forest']
    city_name = sys.argv[1]
    if len(sys.argv) == 3 or (len(sys.argv) == 4 and sys.argv[3] == 'all'):
        if sys.argv[2] in ['de', 'en', 'es', 'fr', 'pr']:
            lang = sys.argv[2]
            target_date = ''
        else:
            target_date = sys.argv[2]
            lang = ''
    elif len(sys.argv) >= 4:
        lang = sys.argv[2]
        target_date = sys.argv[3]
        print 'target_date: ' + target_date
    else:
        lang = ''
        target_date = ''
    test_data_path = '/home/muga/twitter/test_data/retrieved_data/' + city_name + '/'
    cf.validate_directory(test_data_path)
    training_data_path = '/home/muga/twitter/new_trainingdata/'
    train_data_dict = {}
    test_data_list = []  #tweet data obtained from search API
    for f in os.listdir(test_data_path):
        if 'uniq' not in f:  #test file should not contain duplicate lines
            continue

        if f.endswith('.csv') and target_date in f and lang in f:
            if lang and check_lang_file(f, lang):
                test_data_list.append(test_data_path + f)
            elif not lang:
                test_data_list.append(test_data_path + f)

    for f in os.listdir(training_data_path):
        if f.endswith('.csv') and 'new' in f:
            language = cf.find_lang(f)
            print 'language', language
            train_data_dict[language] = training_data_path + f

    if len(test_data_list) == 0:
        print 'Not Found'
        quit()

    print 'list of test data: '
    print '\n'.join(test_data_list)

    print 'list of training data: '
    for k in train_data_dict:
        print k, train_data_dict[k]

    if strategy == 'all':
        confirm = raw_input(
            "it's going to do all classifications. Is it Okay ? (yes/no)")
    else:
        confirm = raw_input(
            'It is going to process these files with %s. Is it okay ? (yes/no)'
            % strategy)
    if confirm.lower() != 'yes':
        print 'cancel'
        quit()
    for test_data in test_data_list:
        if lang:
            language = lang
        else:
            language = cf.find_lang(test_data.split('/')[-1])
        train_data = train_data_dict[language]
        print 'train data: ' + train_data
        print 'test on ' + test_data
        if strategy == 'all':
            for s in strategies:
                classification(train_data, test_data, s)
        else:
            classification(train_data, test_data, strategy)
Code Example #9
import sys
import time
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in older versions
from sklearn.svm import SVC, LinearSVC

# NOTE: 'cf', 'getFeatureVecsAndLabel', 'showResult' and 'writeResult' are
# project-local helpers defined elsewhere in the original file.


def classification(filename_train, filename_test, strategy):
    label_train, feat_vec_train, feat_vec_test, tweets = getFeatureVecsAndLabel(
        filename_train, filename_test)
    print 'data extraction is done'
    scores = ['accuracy', 'precision_micro', 'recall_weighted', 'f1_micro']

    date = time.strftime("%d%b%Y%H%M")
    out_file_name = filename_test.split('/')[-1] + '_' + date + '.txt'
    out_file_name = out_file_name.replace('.csv', '')
    city_name = sys.argv[1]
    out_file_path = "/home/muga/twitter/classification_result/training_data_emoji/" + city_name + "/" + strategy + '/'
    cf.validate_directory(out_file_path, True)
    lang = cf.find_lang(filename_test.split('/')[-1])
    another_out_file_path = out_file_path + lang + '/'
    cf.validate_directory(another_out_file_path, True)

    out = open(out_file_path + out_file_name, 'a')
    start_time = datetime.now()

    print '-' * 10 + strategy + ' ' + filename_test + '-' * 10
    for score in scores:
        if cf.skip_parameter(score, strategy, lang):
            continue
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)

        if strategy == 'one_against_the_rest':
            tuned_parameters = {
                'C': [1, 10, 100, 1000],
                'tol': [1e-3, 1e-4],
                'multi_class': ['ovr', 'crammer_singer']
            }
            clf = GridSearchCV(LinearSVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'one_against_one':
            tuned_parameters = [{
                'kernel': ['rbf'],
                'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]
            }, {
                'kernel': ['linear'],
                'C': [1, 10, 100, 1000]
            }]
            clf = GridSearchCV(SVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'random_forest':
            tuned_parameters = [{
                'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                'max_features': ['auto', 'sqrt', 'log2', None]
            }]
            clf = GridSearchCV(RandomForestClassifier(),
                               param_grid=tuned_parameters,
                               cv=3,
                               scoring=score,
                               n_jobs=-1)
        else:
            print strategy + ' is wrong'
            quit()
        clf.fit(feat_vec_train, label_train)
        print clf.best_estimator_

        y_pred = clf.predict(feat_vec_test)

        showResult(score, y_pred, out)
        writeResult(score, y_pred, tweets, out_file_name,
                    another_out_file_path)
        print 'loop for ' + score + ' is done\n'

    cf.write_exec_time(start_time, out)
    out.close()
    print "classification of " + filename_test + " has done"
    print '-' * 30
    return
Code Example #10
import time
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split  # sklearn.grid_search / sklearn.cross_validation in older versions
from sklearn.svm import SVC, LinearSVC

# NOTE: 'cf' and 'getFeatureVecAndLabel' are project-local helpers defined
# elsewhere in the original file.


def classification(filename, strategy):
    labels, feature_vec = getFeatureVecAndLabel(filename)
    data_train, data_test, label_train, label_test = train_test_split(
        feature_vec, labels, test_size=0.2)

    print 'data extraction is done'
    scores = ['accuracy', 'precision_micro', 'recall_weighted', 'f1_micro']

    date = time.strftime('%d%b%Y%H%M')
    out_file_name = filename.split('/')[-1].split(
        '.')[0] + '_' + date + '_' + strategy + '.txt'
    out_file_path = "/home/muga/twitter/classification_result/classifier_evaluation/training_data_emoji/"
    cf.validate_directory(out_file_path, True)

    out = open(out_file_path + out_file_name, 'a')
    start_time = datetime.now()
    for score in scores:
        out.write('\n' + '-' * 50)
        out.write(score)
        out.write('-' * 50)
        if strategy == 'one_against_the_rest':
            tuned_parameters = {
                'C': [1, 10, 100, 1000],
                'tol': [1e-3, 1e-4],
                'multi_class': ['ovr', 'crammer_singer']
            }
            clf = GridSearchCV(LinearSVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'one_against_one':
            tuned_parameters = [{
                'kernel': ['rbf'],
                'gamma': [1e-3, 1e-4],
                'C': [1, 10, 100, 1000]
            }, {
                'kernel': ['linear'],
                'C': [1, 10, 100, 1000]
            }]
            clf = GridSearchCV(SVC(C=1),
                               param_grid=tuned_parameters,
                               cv=5,
                               scoring=score,
                               n_jobs=-1)
        elif strategy == 'random_forest':
            tuned_parameters = [{
                'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
                'max_features': ['auto', 'sqrt', 'log2', None]
            }]
            clf = GridSearchCV(RandomForestClassifier(),
                               param_grid=tuned_parameters,
                               cv=3,
                               scoring=score,
                               n_jobs=-1)
        else:
            print strategy + ' is wrong'
            quit()
        clf.fit(data_train, label_train)
        print clf.best_estimator_

        y_true, y_pred = label_test, clf.predict(data_test)
        out.write(classification_report(y_true, y_pred))

        print 'loop for ' + score + ' is done\n'

    cf.write_exec_time(start_time, out)
    out.close()
    print "classification of " + filename + " with " + strategy + " has done"
    return
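Since this variant evaluates on a random 20% hold-out, class proportions can drift between the two splits. With scikit-learn >= 0.17, passing stratify preserves them; a drop-in sketch for the split call above (random_state is an arbitrary choice):

    # Optional: stratified 80/20 split keeps the pos/neg/neu ratios intact.
    data_train, data_test, label_train, label_test = train_test_split(
        feature_vec, labels, test_size=0.2, stratify=labels, random_state=0)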
Code Example #11
	# NOTE: this snippet is a fragment -- it begins inside the place loop of a
	# main() like the one in Code Example #4, just after the validate_area check.
			continue
		place_list = [place_full_name, place_name]
		place_id_dict[place_id] = place_list

	if latitude and longtitude:
		out_file_path = "/home/muga/twitter/place_id_data/"
		area_range = accuracy / 1000 #km
		file_name = "placeid_" + latitude + "_" + longtitude + "_" + str(area_range) + "km_"+ ".txt"
	else:
		cityname = cityname.replace(' ','')
		out_file_path = "/home/muga/twitter/place_id_data/" + cityname + "/"
		if granularity:
			file_name = "placeid_" + cityname + "_" + granularity + ".txt"
		else:
			file_name = "placeid_" + cityname + "_keyword_search.txt"

	file_name = file_name.replace(' ', '')
	cf.validate_directory(out_file_path)
	output = open(out_file_path + file_name, 'w')

	output.write('city name: ' + cityname + ', granularity: ' + granularity + ', latitude: ' + latitude + ', longtitude: ' + longtitude + "\n")
	for p_id in place_id_dict:
		#print p + " ----> id:  " + place_dict[p] + "\n"
		s = p_id + ": " + place_id_dict[p_id][0] + ": " + place_id_dict[p_id][1]+ "\n"
		#encode s as UTF-8 before writing
		output.write(s.encode('utf-8'))
	output.close()

if __name__ == '__main__':
	main()