def classify_tweets(labeled_tweets, testing_tweets, mode="find_irrelevant"):
    X, y, vectorizer, labeled_tweets_id = generate_training_feature_matrix(
        labeled_tweets, mode=mode)
    model = random_forest(X, y)
    resulting_tweets = []
    tweets = []
    tweets_txt = []
    has_url = []
    has_username = []
    for tweet in testing_tweets:
        tweet_id = int(float(tweet['id']))
        if tweet_id in labeled_tweets_id:
            continue  # already classified
        if tweet['class']:
            # tweet already carries a class; keep it as-is
            resulting_tweets.append(tweet)
            continue
        text = tweet['text']
        has_url.append(common.has_url(text))
        has_username.append(common.has_username(text))
        text = common.sanitize_text(text)
        tweets_txt.append(text)
        tweets.append(tweet)
    logger.info('already labeled: [%d]; testing: [%d]' % (len(resulting_tweets), len(tweets)))
    # Build the test matrix with the same vectorizer used for training and append
    # the has_username/has_url indicator columns in the same order.
    X_test = vectorizer.transform(tweets_txt).toarray()
    X_test = np.column_stack((X_test, has_username, has_url))
    y_pred = model.predict(X_test)
    for c, tweet in zip(y_pred, tweets):
        if mode == 'find_smoker':
            tweet['class'] = c
        elif mode == 'find_irrelevant':
            # a positive prediction means "irrelevant", encoded as -1
            tweet['class'] = -1 if c == 1 else 0
        resulting_tweets.append(tweet)
    return resulting_tweets
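# The random_forest helper called above is not shown in this section. A minimal sketch,
# assuming it simply fits a scikit-learn RandomForestClassifier on the dense feature
# matrix; the hyperparameters below are illustrative assumptions, not the author's values.
from sklearn.ensemble import RandomForestClassifier

def random_forest(X, y):
    # Train a basic random forest classifier and return the fitted model.
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)
    return model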
def make_dataset():
    if isdir("dataset"):
        rmtree("dataset")
    mkdir("dataset")
    for lang in language_names:
        raw_lang_file_name = path(WIKIDATASET_DIR_RAW, lang + "_raw")
        with open(raw_lang_file_name, "r") as f:
            raw_text = f.read()
        sanitized = sanitize_text(raw_text)
        with open(path("dataset", lang), "w") as wf:
            wf.write(sanitized)
def generate_training_feature_matrix(labeled_tweets, mode='find_irrelevant'):
    # labeled_tweets were previously read directly from a labeled CSV via csv.DictReader.
    training_tweets = []
    tweets_id = set()
    y = []
    has_url = []
    has_username = []
    for tweet in labeled_tweets:
        tweet_id = int(float(tweet['id']))
        c = determine_class(tweet, mode=mode)
        if c is None:
            continue
        y.append(c)
        text = tweet['text']
        has_url.append(common.has_url(text))
        has_username.append(common.has_username(text))
        text = common.sanitize_text(text)
        training_tweets.append(text)
        tweets_id.add(tweet_id)
    logger.info('positive: %d; negative: %d' % (y.count(1), y.count(0)))
    # Word-level unigrams and bigrams, capped at 5000 features.
    # (A binary CountVectorizer over the same n-gram range was an alternative considered here.)
    vectorizer = TfidfVectorizer(
        analyzer="word", ngram_range=(1, 2), min_df=1, max_features=5000)
    X = vectorizer.fit_transform(training_tweets)
    X = X.toarray()
    # Append the has_username/has_url indicator columns to the TF-IDF features.
    # The learned vocabulary and per-term counts can be inspected via the vectorizer if needed.
    X = np.column_stack((X, has_username, has_url))
    return X, np.array(y), vectorizer, tweets_id
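# Hedged usage sketch (hypothetical data): the returned vectorizer must also be used to
# transform any new text so its TF-IDF columns line up with the trained model, with the
# has_username/has_url flags appended in the same order as above. labeled_tweets is
# assumed to be the list of labeled tweet dicts loaded elsewhere.
X, y, vectorizer, seen_ids = generate_training_feature_matrix(
    labeled_tweets, mode='find_irrelevant')
new_text = 'Example tweet text'
features = vectorizer.transform([common.sanitize_text(new_text)]).toarray()
features = np.column_stack((features,
                            [common.has_username(new_text)],
                            [common.has_url(new_text)]))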
def transform_json_to_txt(datafolder, txtfile):
    for root, dirs, files in os.walk(os.path.abspath(datafolder)):
        for f in files:
            if f == 'search.json':
                continue
            with open(os.path.join(root, f), 'r', newline='', encoding='utf-8') as json_f, \
                    open(txtfile, 'a+', newline='', encoding='utf-8') as wf:
                logger.info('processing: [%s]' % os.path.join(root, f))
                for line in json_f:
                    try:
                        tweet = json.loads(line)
                        wf.write('%s ' % common.sanitize_text(tweet['text']))
                    except Exception:
                        # skip lines that are not valid tweet JSON
                        pass
def sanitize_file(filename):
    with open(filename, encoding='utf-8') as f:
        text = f.read()
    # Rewrite the file in place; text mode rather than binary, since sanitize_text
    # returns a str.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(sanitize_text(text))
def filter_raw(raw_json_data_folder):
    tweets = []
    tweetIds = set()
    cnt_total_no_duplicate = 0
    geocoded_cnt = 0
    lang_cnt = 0
    place_cnt = 0
    gps_cnt = 0
    duplicated_cnt = 0
    sanitized_cnt = 0
    total_cnt = 0
    for root, dirs, files in os.walk(os.path.abspath(raw_json_data_folder)):
        for f in files:
            if f != 'search.json' and not f.startswith('.'):
                logger.info(os.path.join(root, f))
                with open(os.path.join(root, f), 'r') as json_file:
                    for line in json_file:
                        try:
                            if not line.startswith('{'):
                                continue
                            tweet = json.loads(line)
                            total_cnt += 1
                            # drop duplicates by tweet id
                            if int(tweet['id']) in tweetIds:
                                duplicated_cnt += 1
                                continue
                            tweetIds.add(int(tweet['id']))
                            cnt_total_no_duplicate += 1
                            # keep English tweets only; fall back to langid when the
                            # tweet carries no 'lang' field
                            if 'lang' in tweet and 'en' != tweet['lang']:
                                continue
                            if 'lang' not in tweet:
                                lang, prob = langid.classify(tweet['text'])
                                if lang != 'en':
                                    continue
                            lang_cnt += 1
                            sanitized_text = common.sanitize_text(tweet['text']).strip()
                            if sanitized_text == '' or sanitized_text == 'RT':
                                continue
                            sanitized_cnt += 1
                            # resolve a US state from the tweet place or the user profile location
                            geolocation = tweet['user']['location']
                            if 'place' in tweet and tweet['place']:
                                if tweet['place']['country_code'] != 'US':
                                    continue
                                geolocation = tweet['place']['full_name']
                            us_state = tug.get_state(geolocation)
                            if not us_state:
                                continue
                            geocoded_cnt += 1
                            tweets.append({
                                'id': tweet['id'],
                                'text': tweet['text'],
                                'clean_text': sanitized_text,
                                'place': tweet['place'] if 'place' in tweet else '',
                                'user_location': tweet['user']['location'],
                                'us_state': us_state,
                                'created_at': tweet['created_at'],
                                'username': tweet['user']['name'],
                                'user_id': tweet['user']['id'],
                                'is_quote_status': tweet['is_quote_status']
                            })
                        except Exception as exc:
                            logger.info(line)
                            logger.warn('ignore: %s' % (exc))
    logger.info('total: %d; duplicate: %d; no_duplicate: %d; en: %d; sanitize_text: %d; geo: %d'
                % (total_cnt, duplicated_cnt, cnt_total_no_duplicate, lang_cnt,
                   sanitized_cnt, geocoded_cnt))
    return tweets
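# Hedged usage sketch: persisting filter_raw's output to a CSV with the standard library.
# The wrapper name and output filename are illustrative assumptions, not from the original code.
import csv

def write_filtered_tweets(raw_json_data_folder, out_csv='filtered_tweets.csv'):
    filtered = filter_raw(raw_json_data_folder)
    if not filtered:
        return
    with open(out_csv, 'w', newline='', encoding='utf-8') as wf:
        writer = csv.DictWriter(wf, fieldnames=list(filtered[0].keys()))
        writer.writeheader()
        writer.writerows(filtered)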