# Standard-library and third-party dependencies; `common`, `logger`, `pattern`,
# `load_stoplist`, `filter_by_frequency`, and `to_csv` are project-local helpers
# defined elsewhere in this repository.
import csv
import json
import math
import random
import re
from datetime import datetime

import ftfy
import nltk
import pandas as pd
import pytz
from gensim import corpora
from nltk.stem import WordNetLemmatizer


def load_data(csv_file, text_fields=None):
    """Read a CSV file and build a gensim dictionary/corpus from the given text fields."""
    if text_fields is None:  # avoid a mutable default argument
        text_fields = []
    data = pd.read_csv(csv_file, encoding='utf-8')
    documents = []
    for i, r in data.iterrows():
        document = ''
        for text_field in text_fields:
            if pd.notnull(r[text_field]):
                # Fix mojibake, then strip URLs, @usernames, '#' signs, and HTML.
                document = '%s %s' % (document, common.cleanhtml(
                    common.remove_hashtag_sign(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(r[text_field]))))))
        documents.append(document)
    logger.info("# of documents: %d" % len(documents))

    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    # Tokenize, lowercase, and lemmatize each document, dropping stopwords.
    texts = [[wordnet_lemmatizer.lemmatize(word.lower())
              for word in nltk.regexp_tokenize(document, pattern)
              if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist]
             for document in documents]
    texts = filter_by_frequency(texts)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return dictionary, corpus
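# A minimal usage sketch, not part of the original pipeline: the
# (dictionary, corpus) pair returned by load_data() is the format gensim's
# LdaModel expects. The CSV path and field name below are hypothetical.
def _example_train_lda(csv_file='tweets.csv', num_topics=10):
    from gensim import models
    dictionary, corpus = load_data(csv_file, text_fields=['text'])
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    for topic_id, words in lda.show_topics(num_topics=num_topics, num_words=10):
        logger.info('topic %s: %s' % (topic_id, words))
    return lda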
def extract_text_for_BTM_topic_distribution(source, output_file):
    """Clean and lemmatize each tweet, keep its US state and year-month, and write the rows to CSV."""
    tweets = []
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        clean_text = common.cleanhtml(
            common.remove_hashtag_sign(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text'])))))
        temp = [wordnet_lemmatizer.lemmatize(word.lower())
                for word in nltk.regexp_tokenize(clean_text, pattern)
                if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist]
        if len(temp) == 0:  # skip tweets that are empty after preprocessing
            continue
        preprocessed_text = ' '.join(temp)
        # Parse Twitter's created_at format and keep a zero-padded "YYYY-MM" label.
        date = datetime.strptime(
            row['created_at'],
            '%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC)
        y_m = '%d-%02d' % (date.year, date.month)
        tweets.append({'clean_text': clean_text,
                       'us_state': row['us_state'],
                       'preprocessed_text': preprocessed_text,
                       'date': y_m})
    logger.info(len(tweets))
    to_csv(tweets, output_file)
def extract_hashtag(source):
    """Count hashtag frequencies over all tweets and dump the counts to JSON."""
    hashtag_counts = {}
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        if '#' in row['text']:
            text = common.cleanhtml(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text']))))
            hashtags = re.findall(r"#(\w+)", text)
            for tag in hashtags:
                tag = tag.lower()
                hashtag_counts[tag] = hashtag_counts.get(tag, 0) + 1
    logger.info(hashtag_counts)
    with open('./intermediate_data/hastags.json', 'w') as outfile:
        json.dump(hashtag_counts, outfile)
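# A hedged helper sketch, not in the original code: read the counts dumped by
# extract_hashtag() above and log the most frequent tags. The path mirrors the
# one written above; `top_n` is an illustrative parameter.
def _example_top_hashtags(top_n=20):
    with open('./intermediate_data/hastags.json', 'r') as infile:
        counts = json.load(infile)
    # Sort hashtags by descending frequency.
    ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    for tag, freq in ranked[:top_n]:
        logger.info('#%s: %d' % (tag, freq))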
def extract_tweet_not_by_uid(source):
    """Write raw tweets to CSV and their preprocessed text to a BTM input file."""
    tweets = []
    BTM_input = []
    wordnet_lemmatizer = WordNetLemmatizer()
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        clean_text = common.cleanhtml(
            common.remove_hashtag_sign(
                common.remove_username(
                    common.remove_url(ftfy.fix_text(row['text'])))))
        temp = [wordnet_lemmatizer.lemmatize(word.lower())
                for word in nltk.regexp_tokenize(clean_text, pattern)]
        if len(temp) == 0:
            continue
        tweets.append(row['text'])
        BTM_input.append(' '.join(temp))
    with open('./intermediate_data/hpv_tweets/hpv_tweets_not_by_uid.csv',
              'w', newline='', encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=['tweets'], delimiter=',',
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for tweet in tweets:
            writer.writerow({'tweets': tweet})
    # One preprocessed tweet per line, the input format BTM expects.
    with open('./intermediate_data/hpv_tweets/hpv_tweets_not_by_uid_BTM_input.txt',
              'w', encoding='utf-8') as outfile:
        for tweet in BTM_input:
            outfile.write(tweet + '\n')
def extract_clean_text(json_file):
    """Concatenate each user's tweets, clean and lemmatize them, and return one token list per user."""
    wv = []
    cnt = 0
    stoplist = load_stoplist()
    wordnet_lemmatizer = WordNetLemmatizer()
    with open(json_file, 'r') as f:
        user_tweets = json.load(f)
    for user in user_tweets:
        text = ''
        for tweet in user_tweets[user]:
            text += common.cleanhtml(
                common.remove_hashtag_sign(
                    common.remove_username(
                        common.remove_url(ftfy.fix_text(tweet))))) + ' '
        clean_texts = [wordnet_lemmatizer.lemmatize(word.lower())
                       for word in nltk.regexp_tokenize(text, pattern)
                       if wordnet_lemmatizer.lemmatize(word.lower()) not in stoplist]
        wv.append(clean_texts)
        cnt += 1
    logger.info('total users: %d;' % cnt)
    return wv
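# A minimal sketch, assuming the per-user token lists are meant to feed a
# word-embedding model (the variable name `wv` in extract_clean_text()
# suggests word vectors). The JSON path and the use of gensim's Word2Vec are
# assumptions, not part of the original code.
def _example_train_word2vec(json_file='./intermediate_data/user_tweets.json'):
    from gensim.models import Word2Vec
    sentences = extract_clean_text(json_file)
    # min_count drops rare tokens; remaining hyperparameters use gensim defaults.
    return Word2Vec(sentences, min_count=5)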
def generate_corpus_for_quality_evaluation(k, pz_d, tweets, topic_words_distribution):
    """Sample up to two deduplicated tweets per (topic, keyword) pair for manual quality evaluation."""
    all_tweets = []
    logger.info(k)
    df = pd.read_csv(tweets, encoding='utf-8')
    for index, row in df.iterrows():
        all_tweets.append(row['tweets'])

    # Each line of the pz_d file holds the per-topic probabilities p(z|d) for one tweet.
    tweets_pz_d = []
    with open(pz_d) as f:
        for l in f:
            line = l.strip().split(' ')
            tweets_pz_d.append([float(p) for p in line])

    # Assign each tweet to its most probable topic, skipping rows containing NaNs.
    results = {}
    for j in range(len(tweets_pz_d)):
        if any(math.isnan(p) for p in tweets_pz_d[j]):
            continue
        topic_id = tweets_pz_d[j].index(max(tweets_pz_d[j]))
        if topic_id not in results:
            results[topic_id] = [all_tweets[j]]
        else:
            results[topic_id].append(all_tweets[j])

    final_result = []
    for tp in results:
        for keyword in topic_words_distribution[tp][1]:
            temp = []
            dedup = set()
            for tweet in results[tp]:
                if str(keyword[0]) in tweet.lower():
                    # Normalize the tweet text so near-duplicates are collapsed.
                    clean_text_list = (common.cleanhtml(
                        common.remove_username(
                            common.remove_url(ftfy.fix_text(tweet.lower()))))
                    ).strip(' ').replace('\n', ' ').split(' ')[:-1]
                    clean_text = ",".join(str(x) for x in clean_text_list)
                    if clean_text not in dedup:
                        temp.append(tweet)
                        dedup.add(clean_text)
            # Sample at most two distinct tweets per keyword.
            if len(temp) <= 2:
                samples_number = range(len(temp))
            else:
                samples_number = random.sample(range(len(temp)), 2)
            for i in samples_number:
                final_result.append({'topic_id': tp,
                                     'keyword': keyword[0],
                                     'probability': keyword[1],
                                     'tweet': temp[i]})
    to_csv(final_result,
           '../../papers/2017_BMC_HPV/analysis/BTM/quality_evaluation/' + str(k) + 'tp.csv')
def group_tweets_by_cluster_gold_standard(source, k):
    """Group tweets into k gold-standard clusters by hashtag, writing per-cluster files plus BTM/LDA training data."""
    tweets = []
    all_tweets_in_cluster = []
    wordnet_lemmatizer = WordNetLemmatizer()
    # Read all tweets.
    df = pd.read_csv(source, encoding='utf-8')
    for index, row in df.iterrows():
        tweets.append(row['text'])

    with open('./intermediate_data/cluster_hashtags.json', 'r') as json_file:
        hashtags = json.load(json_file)

    for i in range(k):
        with open('./intermediate_data/LDA_BTM_comparison/clusters/' + str(i) + 'tp.txt',
                  'w', encoding='utf-8') as clusters:
            logger.info(i)
            for tweet in tweets:
                tags = re.findall(r"#(\w+)", tweet)
                for tag in tags:
                    if tag.lower() in hashtags[i]:
                        text = common.cleanhtml(
                            common.remove_hashtag_sign(
                                common.remove_username(
                                    common.remove_url(ftfy.fix_text(tweet)))))
                        clean_texts = [wordnet_lemmatizer.lemmatize(word.lower())
                                       for word in nltk.regexp_tokenize(text, pattern)]
                        final_text = ' '.join(clean_texts)
                        all_tweets_in_cluster.append(final_text)
                        clusters.write(final_text)
                        clusters.write('\n')
                        break  # assign each tweet to at most one cluster

    # txt for BTM: one preprocessed tweet per line.
    with open('./intermediate_data/LDA_BTM_comparison/lda_BTM_comparison_traning_data.txt',
              'w', encoding='utf-8') as txt_f:
        for tweet in all_tweets_in_cluster:
            txt_f.write(tweet)
            txt_f.write('\n')

    # csv for LDA.
    with open('./intermediate_data/LDA_BTM_comparison/lda_BTM_comparison_traning_data.csv',
              'w', newline='', encoding='utf-8') as csv_f:
        writer = csv.DictWriter(csv_f, fieldnames=['clean_text'], delimiter=',',
                                quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for tweet in all_tweets_in_cluster:
            writer.writerow({'clean_text': tweet})
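# A usage sketch tying the pieces together (an assumed workflow, not original
# code): the CSV written above has a 'clean_text' column, so it can be fed back
# through load_data() to train LDA on the same gold-standard tweets that the
# .txt file feeds to BTM. The topic count is an illustrative parameter.
def _example_lda_on_gold_standard(num_topics=10):
    from gensim import models
    dictionary, corpus = load_data(
        './intermediate_data/LDA_BTM_comparison/lda_BTM_comparison_traning_data.csv',
        text_fields=['clean_text'])
    return models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)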