def omd_hcr(in_path):
    """Read labelled messages from an XML file made of ``<item>`` elements.

    Every ``<item>`` whose ``label`` attribute is ``positive``, ``negative``
    or ``neutral`` contributes one example: the text of its ``<content>``
    child, passed through ``preprocess``.

    Args:
        in_path: path of the XML file to parse.

    Returns:
        A list of ``[label, message]`` pairs, in document order.
    """
    wanted = ('positive', 'negative', 'neutral')
    data = []
    with open(in_path) as fid:
        soup = BeautifulSoup(fid.read(), "xml")
    for item in soup.findAll('item'):
        # .get() so that an <item> without a label attribute is skipped
        # like any other unwanted label instead of raising KeyError.
        label = item.attrs.get('label')
        if label not in wanted:
            continue
        msg = item.find("content").text
        # Beautiful Soup returns already-decoded text; calling .decode() on
        # it forces an implicit ASCII encode that fails on any non-ASCII
        # character, so only raw byte strings are decoded here.
        if isinstance(msg, bytes):
            msg = msg.decode("utf-8")
        data.append([label, preprocess(msg)])
    return data
def semeval(in_path):
    """Load (label, tweet) examples from a UTF-8, tab-separated file.

    Each line is split on tabs: the first field is the label (double quotes
    removed) and the second field is the tweet text.  Only lines labelled
    ``positive``, ``negative`` or ``neutral`` are kept, and their tweet is
    run through ``preprocess``.

    Args:
        in_path: path of the tab-separated input file.

    Returns:
        A list of ``(label, tweet)`` tuples in file order.
    """
    wanted = ('positive', 'negative', 'neutral')
    examples = []
    with codecs.open(in_path, "r", "utf-8") as handle:
        for line in handle:
            fields = line.replace("\n", "").split("\t")
            polarity = fields[0].replace("\"", "")
            # Read the tweet column before filtering, so a line with no
            # second field fails the same way regardless of its label.
            text = fields[1]
            if polarity not in wanted:
                continue
            examples.append((polarity, preprocess(text)))
    return examples
def stance(in_path, topic):
    """Load the examples for one topic from a tab-separated file.

    Each line must hold exactly four tab-separated fields; the first is
    ignored and the remaining three are the topic, the tweet and the label.
    A line is kept when its topic equals ``topic`` and its label is
    ``FAVOR``, ``AGAINST`` or ``NONE``.

    Args:
        in_path: path of the tab-separated input file.
        topic: topic string a line must match exactly to be kept.

    Returns:
        A list of ``[label, tweet]`` pairs, where ``label`` is lower-cased
        and ``tweet`` is the preprocessed text encoded back to UTF-8.

    Raises:
        ValueError: if a line does not have exactly four fields.
    """
    wanted = ('FAVOR', 'AGAINST', 'NONE')
    data = []
    with open(in_path) as fid:
        for line in fid:
            # Strip whichever terminator is present.  Removing only "\r\n"
            # left a trailing "\n" on the label for LF-terminated input, so
            # no label ever matched and the result was silently empty.
            fields = line.rstrip("\r\n").split("\t")
            current_topic, tweet, label = fields[1:]
            if current_topic != topic or label not in wanted:
                continue
            tweet = preprocess(tweet.decode("utf-8"))
            data.append([label.lower(), tweet.encode("utf-8")])
    return data
def casm(in_path): cache = dict() data = [] labs = [] with codecs.open(in_path, "r", "utf-8") as fid: for l in fid: spt = l.split("\t") label = spt[0].split(",")[1] tweet = preprocess(spt[1]) if tweet in cache: continue cache[tweet] = True # print label, tweet ex = [label, tweet] labs.append(label) data.append(ex) print "labels: ", list(set(labs)) return data
if not os.path.exists(output): os.makedirs(output) #### --- TRAIN DATA ---- #### tweets_by_user = {} print "[reading user tweets]" z = 0 MAX_USERS = 100 MAX_USERS = float('inf') MIN_TWEETS = 100 for fname in os.listdir(path_train): if os.path.splitext(path_train + fname)[1] != ".gz": print "ignored %s" % fname continue with gzip.open(path_train + fname, 'r') as f: user = fname[:fname.index(".")] data = [preprocess(json.loads(l)['text']) for l in f] # data = set([json.loads(l)['text'] for l in f]) if len(data) < MIN_TWEETS: print "ignored user %s | %d tweets" % (user, len(data)) continue tweets_by_user[user] = set(data) z += 1 sys.stdout.write("\ruser: "******" (" + str(z) + ")" + " " * 20) sys.stdout.flush() if z >= MAX_USERS: print "out early!!!!" break print "[writing user tweets]" user_corpus = codecs.open(output + "mental_health_tweets", "w", "utf-8") for user, twt in tweets_by_user.items():