def classifyEmoticons(text):
    # find all emoticons
    emoticons = EMOTICON_RE.findall(text)
    pos = any(emo in POS_EMOTICONS for emo in emoticons) or bool(POS_EMOJIS_RE.search(text))
    neg = any(emo in NEG_EMOTICONS for emo in emoticons) or bool(NEG_EMOJIS_RE.search(text))
    if pos and neg:
        return 'N/A'
    elif pos and not neg:
        return 'pos'
    elif neg and not pos:
        return 'neg'
    elif not pos and not neg:
        return None
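# The classifier above depends on module-level constants defined elsewhere in this
# project (EMOTICON_RE, POS_EMOTICONS, NEG_EMOTICONS, POS_EMOJIS_RE, NEG_EMOJIS_RE).
# The definitions below are only an illustrative sketch of what those objects are
# assumed to look like; the real patterns and emoticon sets may differ.
import re

EMOTICON_RE = re.compile(r"[<>]?[:;=8][\-o\*']?[\)\(\]\[dDpP/\:\}\{@\|\\]", re.UNICODE)
POS_EMOTICONS = {':)', ':-)', ':D', ';)', ':]'}
NEG_EMOTICONS = {':(', ':-(', ':[', ":'("}
POS_EMOJIS_RE = re.compile('[\U0001F600-\U0001F60F]')  # rough "smiling face" range
NEG_EMOJIS_RE = re.compile('[\U0001F61E-\U0001F62B]')  # rough "sad face" range

# With definitions of this shape the classifier behaves as follows (hypothetical inputs):
# classifyEmoticons('great game :) :D')         -> 'pos'
# classifyEmoticons('so close :( but happy :)') -> 'N/A'
# classifyEmoticons('no emoticons here')        -> None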
def emoticonList(corpus):
    emolist = []
    emocount = []
    emoTorF = []
    for a in corpus:
        ct = []    # store count
        TorF = []  # positive or negative
        all_emoticons = EMOTICON_RE.findall(a)
        ct.append(len(all_emoticons))
        if len(all_emoticons) != 0:
            ed = EmoticonDetector()
            all_emoticons.sort()
            # index the tuple with the boolean: 1 if the first emoticon is positive, -1 otherwise
            tf = (-1, 1)[ed.is_positive(all_emoticons[0])]
            TorF.append(tf)
        else:
            TorF.append(0)
        emolist.append(all_emoticons)
        emocount.append(ct)
        emoTorF.append(TorF)
    return emocount, emoTorF, emolist
def json2csv_preprocess(json_file, outfile, fields, encoding='utf8', errors='replace',
                        gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True,
                        skip_ambiguous_tweets=True, strip_off_emoticons=True,
                        remove_duplicates=True, limit=None):
    """
    Convert a json file to a csv file, preprocessing each row to obtain a suitable
    dataset for tweet Sentiment Analysis.

    :param json_file: the original json file containing tweets.
    :param outfile: the output csv filename.
    :param fields: a list of fields that will be extracted from the json file and
        kept in the output csv file.
    :param encoding: the encoding of the files.
    :param errors: the error handling strategy for the output writer.
    :param gzip_compress: if True, create a compressed GZIP file.
    :param skip_retweets: if True, remove retweets.
    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" emoticons.
    :param skip_ambiguous_tweets: if True, remove tweets containing both happy and sad emoticons.
    :param strip_off_emoticons: if True, strip emoticons from all tweets.
    :param remove_duplicates: if True, remove tweets appearing more than once.
    :param limit: an integer to set the number of tweets to convert. After the limit
        is reached the conversion will stop. It can be useful to create subsets of
        the original tweets json data.
    """
    with codecs.open(json_file, encoding=encoding) as fp:
        (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
        # write the list of fields as header
        writer.writerow(fields)

        if remove_duplicates:
            tweets_cache = []
        i = 0
        for line in fp:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            try:
                text = row[fields.index('text')]
                # Remove retweets
                if skip_retweets:
                    if re.search(r'\bRT\b', text):
                        continue
                # Remove tweets containing ":P" and ":-P" emoticons
                if skip_tongue_tweets:
                    if re.search(r'\:\-?P\b', text):
                        continue
                # Remove tweets containing both happy and sad emoticons
                if skip_ambiguous_tweets:
                    all_emoticons = EMOTICON_RE.findall(text)
                    if all_emoticons:
                        if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD):
                            continue
                # Strip off emoticons from all tweets
                if strip_off_emoticons:
                    row[fields.index('text')] = re.sub(r'(?!\n)\s+', ' ',
                                                       EMOTICON_RE.sub('', text))
                # Remove duplicate tweets
                if remove_duplicates:
                    if row[fields.index('text')] in tweets_cache:
                        continue
                    else:
                        tweets_cache.append(row[fields.index('text')])
            except ValueError:
                pass
            writer.writerow(row)
            i += 1
            if limit and i >= limit:
                break
        outf.close()
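# A minimal usage sketch for json2csv_preprocess, assuming the tweets were collected as
# line-delimited JSON (one tweet object per line) and that 'id' and 'text' are the fields
# to keep; the file names below are placeholders, not files shipped with this project.
json2csv_preprocess('tweets.json', 'tweets_cleaned.csv', ['id', 'text'],
                    skip_retweets=True,        # drop anything containing a bare "RT"
                    strip_off_emoticons=True,  # remove emoticons from the text column
                    remove_duplicates=True,    # keep only the first copy of each text
                    limit=10000)               # stop after writing 10000 rows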
i = 0
url_re = re.compile(URLS, re.VERBOSE | re.I | re.UNICODE)
hashtag_re = re.compile(r'(?:^|\s)[#＃]{1}(\w+)', re.UNICODE)
# mention_re = re.compile(r'(?:^|\s)[@＠]{1}([^\s#<>[\]|{}]+)', re.UNICODE)  # To include more complete names
mention_re = re.compile(r'(?:^|\s)[@＠]{1}(\w+)', re.UNICODE)

with open(text_file, 'r') as text_reader, \
        open(words_file, 'w', encoding='utf-8') as words_writer, \
        open(emo_file, 'w', encoding='utf-8') as emo_writer, \
        open(hash_file, 'w', encoding='utf-8') as hash_writer, \
        open(at_file, 'w', encoding='utf-8') as at_writer, \
        open(link_file, 'w', encoding='utf-8') as link_writer:
    for line in text_reader:
        line = line.rstrip().lower()
        hashs = hashtag_re.findall(line)
        ats = mention_re.findall(line)
        links = url_re.findall(line)
        line = clean(line, hashs, ats, links)
        emoticons = emo_re.findall(line)
        emojis = [w for w in line if w in emoji.UNICODE_EMOJI]
        words = re.findall('[a-záéíóúñ][a-záéíóúñ_-]+', line)  # Review: also remove ats, hashs and links here
        words_writer.write(' '.join(w for w in words) + '\n')
        emo_writer.write(' '.join(w for w in emoticons + emojis) + '\n')
        hash_writer.write(' '.join(w for w in hashs) + '\n')
        at_writer.write(' '.join(w for w in ats) + '\n')
        link_writer.write(' '.join(w for w in links) + '\n')
        i += 1
print(i)