def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    """Build the term-document matrix (feature vectors) for topic-labelled tweets.

    in_file_path     -- TSV file: TweetID | Topic | Tweet | class (sentiment)
    selected_n_grams -- dict mapping topic -> iterable of selected feature tokens
    tdm_dict         -- output: tid -> {token: count}, populated in place
    tid_dict         -- tweet ids to vectorize (only these get feature vectors)
    sc_dict          -- unused here; kept for interface parity with dataReader
    """
    # NOTE(review): 'rb' + csv.reader implies this is Python 2 code — confirm.
    with open(in_file_path, 'rb') as tsv_in:
        # Initialize the TDM: every known tweet id gets a zeroed count for
        # every selected token of every topic.
        for tid in tid_dict:
            tdm_dict[tid] = {}
            for topic in selected_n_grams:
                for token in selected_n_grams[topic]:
                    tdm_dict[tid][token] = 0
        # Create csv reader for the input file.
        # File format: TweetID | Topic | Tweet | class (sentiment)
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[2]
            if len(row) > 3:
                words = dataCleaner.removeStopWords(
                    dataCleaner.removeSpecialCharacters(tweet), 1)
                # Counting is nested inside the length guard so `words` can
                # never be stale or undefined when a short row appears.
                if tid in tid_dict:
                    for topic in selected_n_grams:
                        for token in selected_n_grams[topic]:
                            # Presence count: +1 per row containing the token,
                            # regardless of multiplicity within the tweet.
                            if token in words:
                                tdm_dict[tid][token] += 1
def dataReader(in_file_path, n_grams, tid_dict, sc_dict):
    """Read a tweet TSV and accumulate token frequencies and sentiment counts.

    File format: TweetID | Tweet | class (sentiment).
    n_grams (token -> frequency), tid_dict (tid -> sentiment) and
    sc_dict (sentiment -> count) are all updated in place.
    """
    with open(in_file_path, 'rb') as tsv_in:
        # Create csv reader for the input file.
        reader = csv.reader(tsv_in, delimiter='\t')
        for record in reader:
            tweet_id = record[0]
            tweet = record[1]
            if len(record) <= 2:
                continue  # row lacks the sentiment column; skip it
            sentiment = record[2]
            # Only consider unique tweets whose sentiment class is known.
            if tweet_id in tid_dict or sentiment not in sc_dict:
                continue
            tid_dict[tweet_id] = sentiment
            sc_dict[sentiment] += 1
            # Obtain individual words from the tweet: emoticons converted
            # first, then special characters/numbers removed and text
            # lower-cased, then stop words dropped.
            tokens = dataCleaner.removeStopWords(
                dataCleaner.removeSpecialCharacters(
                    dataCleaner.convertEmoticons(tweet)), 1)
            for token in tokens:
                n_grams[token] = n_grams.get(token, 0) + 1
def dataReader(in_file_path, n_grams, tid_dict, sc_dict):
    """Read a tweet TSV and accumulate token frequencies and sentiment counts.

    File format: TweetID | Tweet | class (sentiment).
    n_grams (token -> frequency), tid_dict (tid -> sentiment) and
    sc_dict (sentiment -> count) are updated in place.

    NOTE(review): this file defines `dataReader` twice with near-identical
    bodies; the later definition shadows the earlier one. Consider deleting
    one copy.
    """
    with open(in_file_path, 'rb') as tsv_in:
        # Create csv reader for the input file.
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[1]
            if len(row) > 2:
                sentiment = row[2]
                # Only consider unique tweets ('tid not in' replaces the
                # un-idiomatic 'not tid in').
                if tid not in tid_dict:
                    # Only accept rows whose sentiment class is known.
                    if sentiment in sc_dict:
                        tid_dict[tid] = sentiment
                        sc_dict[sentiment] += 1
                        # Obtain individual words from the tweet: emoticons
                        # converted, special characters/numbers removed,
                        # lower-cased, stop words dropped.
                        words = dataCleaner.removeStopWords(
                            dataCleaner.removeSpecialCharacters(
                                dataCleaner.convertEmoticons(tweet)), 1)
                        for token in words:
                            if token in n_grams:
                                n_grams[token] += 1
                            else:
                                n_grams[token] = 1
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    """Build feature vectors (term counts) for each known tweet id.

    in_file_path     -- TSV file: TweetID | Tweet | class (sentiment)
    selected_n_grams -- dict whose keys are the selected feature tokens
    tdm_dict         -- output: tid -> {token: count}, populated in place
    tid_dict         -- tweet ids to vectorize
    sc_dict          -- unused here; kept for interface parity with dataReader
    """
    # NOTE(review): 'rb' + csv.reader implies this is Python 2 code — confirm.
    with open(in_file_path, 'rb') as tsv_in:
        # Initializing the TDM: one zeroed count dict per tweet id.
        # The original first assigned the *shared* selected_n_grams dict
        # itself (a dead store and an aliasing hazard) before overwriting it;
        # build the zeroed dict directly instead.
        for tid in tid_dict:
            tdm_dict[tid] = {token: 0 for token in selected_n_grams}
        # Create csv reader for the input file.
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[1]
            if len(row) > 2:
                words = dataCleaner.removeStopWords(
                    dataCleaner.removeSpecialCharacters(tweet), 1)
                if tid in tid_dict:
                    for token in selected_n_grams:
                        # Presence count: +1 per row containing the token.
                        if token in words:
                            tdm_dict[tid][token] += 1
def createFV(in_file_path, selected_n_grams, tdm_dict, tid_dict, sc_dict):
    """Build feature vectors (term counts) for each known tweet id.

    in_file_path     -- TSV file: TweetID | Tweet | class (sentiment)
    selected_n_grams -- dict whose keys are the selected feature tokens
    tdm_dict         -- output: tid -> {token: count}, populated in place
    tid_dict         -- tweet ids to vectorize
    sc_dict          -- unused here; kept for interface parity with dataReader

    NOTE(review): this file defines `createFV` more than once; the later
    definition shadows the earlier one. Consider deleting one copy.
    """
    # NOTE(review): 'rb' + csv.reader implies this is Python 2 code — confirm.
    with open(in_file_path, 'rb') as tsv_in:
        # Initializing the TDM (one zeroed count dict per tweet id).
        # The original assigned the *shared* selected_n_grams dict first —
        # a dead store that risked aliasing; construct the fresh zero dict
        # in a single step instead.
        for tid in tid_dict:
            tdm_dict[tid] = dict.fromkeys(selected_n_grams, 0)
        # Create csv reader for the input file.
        tsv_in = csv.reader(tsv_in, delimiter='\t')
        for row in tsv_in:
            tid = row[0]
            tweet = row[1]
            if len(row) > 2:
                words = dataCleaner.removeStopWords(
                    dataCleaner.removeSpecialCharacters(tweet), 1)
                if tid in tid_dict:
                    for token in selected_n_grams:
                        # Presence count: +1 per row containing the token.
                        if token in words:
                            tdm_dict[tid][token] += 1