import numpy
from nltk.tokenize import TweetTokenizer

# filter_text, create_ngram_set, load_split_word, InitializeWords,
# load_unicode_mapping and load_abbreviation are assumed to be defined
# elsewhere in this module.


def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict,
              normalize_text=False, split_hashtag=False, ignore_profiles=False,
              lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
    """Parse tab-separated tweet lines into (id, label, text, dimensions, context, author) tuples."""
    data = []
    tokenizer = TweetTokenizer()
    for i, line in enumerate(lines):
        # lightweight progress indicator
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)

        # convert the whole line to lowercase before splitting
        if lowercase:
            line = line.lower()

        # split into tab-separated fields
        token = line.split('\t')

        # ID
        id = token[0]

        # label
        label = int(token[1].strip())

        # tweet text: raw characters in at_character mode, word tokens otherwise
        if at_character:
            target_text = [c for c in token[2].strip()]
        else:
            target_text = tokenizer.tokenize(token[2].strip())

        # optionally append joined n-grams as extra tokens
        if n_grams is not None:
            n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
            target_text.extend(['_'.join(n) for n in n_grams_list])

        # filter text
        target_text = filter_text(target_text, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        # AWC (psychological) dimensions, encoded as 'tag@@value|tag@@value'
        dimensions = []
        if len(token) > 3 and token[3].strip() != 'NA':
            dimensions = [dimension.split('@@')[1]
                          for dimension in token[3].strip().split('|')]

        # context tweet
        context = []
        if len(token) > 4 and token[4] != 'NA':
            context = tokenizer.tokenize(token[4].strip())
            context = filter_text(context, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        # author
        author = 'NA'
        if len(token) > 5:
            author = token[5]

        # keep only rows that still have text after filtering
        if len(target_text) != 0:
            data.append((id, label, target_text, dimensions, context, author))
    print('')
    return data
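
# Usage sketch for the line-based loader above: a minimal example, not part
# of the original module. The TSV path, the resource paths and the column
# layout (ID<TAB>label<TAB>text[<TAB>dimensions[<TAB>context[<TAB>author]]])
# are assumptions; the resource loaders are the ones this module already
# references.
def _example_parsedata():
    with open('resource/train.tsv') as f:  # hypothetical input file
        lines = f.readlines()
    split_word_list = load_split_word('resource/word_split.txt')  # hypothetical path
    word_list = InitializeWords('resource/word_list.txt')         # hypothetical path
    emoji_dict = load_unicode_mapping('resource/emoji.txt')       # hypothetical path
    abbreviation_dict = load_abbreviation()
    data = parsedata(lines, word_list, split_word_list, emoji_dict,
                     abbreviation_dict, split_hashtag=True, lowercase=True)
    print(len(data), 'examples loaded')
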
def parse_sent(sent, word_file_path, split_word_path, emoji_file_path, vocab,
               normalize_text=False, split_hashtag=False, ignore_profiles=False,
               lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
    """Tokenize, filter and index a single sentence against a word-to-id vocab."""
    # convert the sentence to lowercase
    if lowercase:
        sent = sent.lower()

    # sentence text: raw characters in at_character mode, word tokens otherwise
    if at_character:
        target_text = [c for c in sent.strip()]
    else:
        target_text = TweetTokenizer().tokenize(sent.strip())

    # optionally append joined n-grams as extra tokens
    if n_grams is not None:
        n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
        target_text.extend(['_'.join(n) for n in n_grams_list])

    word_list = None
    emoji_dict = None

    # load hashtag-split resource
    split_word_list = load_split_word(split_word_path)

    # load word dictionary (only needed for hashtag splitting)
    if split_hashtag:
        word_list = InitializeWords(word_file_path)

    # load unicode-to-text emoji mapping
    if replace_emoji:
        emoji_dict = load_unicode_mapping(emoji_file_path)

    abbreviation_dict = load_abbreviation()

    # filter text
    target_text = filter_text(target_text, word_list, split_word_list, emoji_dict,
                              abbreviation_dict, normalize_text, split_hashtag,
                              ignore_profiles, replace_emoji=replace_emoji)

    # map tokens to vocabulary ids, tracking coverage of known words
    known_words_set = set()
    unknown_words_set = set()
    tokens = 0
    token_coverage = 0
    vec = []
    for word in target_text:
        tokens = tokens + 1
        if word in vocab:
            vec.append(vocab[word])
            token_coverage = token_coverage + 1
            known_words_set.add(word)
        else:
            vec.append(vocab['unk'])
            unknown_words_set.add(word)

    return numpy.asarray([vec])
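
# Usage sketch for parse_sent: a minimal example assuming a tiny toy
# vocabulary. The resource paths are hypothetical placeholders, and a real
# vocab would come from the training data. Note that parse_sent expects an
# 'unk' entry for out-of-vocabulary tokens.
def _example_parse_sent():
    vocab = {'unk': 1, 'i': 2, 'love': 3, 'mondays': 4}  # toy vocab (assumption)
    vec = parse_sent('I love #Mondays ...',
                     'resource/word_list.txt',   # hypothetical path
                     'resource/word_split.txt',  # hypothetical path
                     'resource/emoji.txt',       # hypothetical path
                     vocab,
                     split_hashtag=True,
                     lowercase=True)
    print(vec.shape)  # (1, number_of_tokens)
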
def parsedata_df(dataset, word_list, split_word_list, emoji_dict, abbreviation_dict,
                 normalize_text=False, split_hashtag=False, ignore_profiles=False,
                 lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
    """DataFrame-based counterpart of the line-based parsedata above.

    Expects columns ID, label and text, plus optional psychological_dimension,
    context and author columns.
    """
    data = []
    columns = list(dataset.columns)
    tokenizer = TweetTokenizer()
    for i, row in enumerate(dataset.itertuples()):
        # lightweight progress indicator
        if i % 100 == 0:
            print(str(i) + '...', end='', flush=True)

        # ID
        id = row.ID

        # label
        label = row.label
        target_text = row.text

        # convert the text to lowercase
        if lowercase:
            target_text = target_text.lower()

        # tweet text
        target_text = tokenizer.tokenize(target_text.strip())

        # optionally append joined n-grams as extra tokens
        if n_grams is not None:
            n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
            target_text.extend(['_'.join(n) for n in n_grams_list])

        # filter text
        target_text = filter_text(target_text, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        # split into characters after filtering (note: the line-based parsedata
        # splits before filtering)
        if at_character:
            target_text = [c for c in ' '.join(target_text)]

        # AWC (psychological) dimensions, encoded as 'tag@@value|tag@@value'
        dimensions = []
        if ('psychological_dimension' in columns
                and row.psychological_dimension.strip() != 'NA'):
            dimensions = [dimension.split('@@')[1]
                          for dimension in row.psychological_dimension.strip().split('|')]

        # context tweet
        context = []
        if 'context' in columns and row.context != 'NA':
            context = row.context.strip()
            # convert the context to lowercase
            if lowercase:
                context = context.lower()
            context = tokenizer.tokenize(context)
            context = filter_text(context, word_list, split_word_list, emoji_dict,
                                  abbreviation_dict, normalize_text, split_hashtag,
                                  ignore_profiles, replace_emoji=replace_emoji)

        # author
        author = 'NA'
        if 'author' in columns:
            author = row.author

        # keep only rows that still have text after filtering
        if len(target_text) != 0:
            data.append((id, label, target_text, dimensions, context, author))
        else:
            print(row)
    print('')
    return data
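
# Usage sketch for the DataFrame-based loader: a minimal example, not part of
# the original module. pandas is only needed here; the toy rows and column
# names match what parsedata_df reads (ID, label, text), and passing
# word_list=None follows the same pattern parse_sent uses when hashtag
# splitting is disabled.
def _example_parsedata_df():
    import pandas

    dataset = pandas.DataFrame({
        'ID': ['1', '2'],
        'label': [1, 0],
        'text': ['Oh great, another Monday...', 'Nice weather today'],
    })
    split_word_list = load_split_word('resource/word_split.txt')  # hypothetical path
    emoji_dict = load_unicode_mapping('resource/emoji.txt')       # hypothetical path
    abbreviation_dict = load_abbreviation()
    data = parsedata_df(dataset, None, split_word_list, emoji_dict,
                        abbreviation_dict, lowercase=True)
    print(len(data), 'examples loaded')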