def main(input_file_path, output_dir_path, main_task, protect_att):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    df = get_data(input_file_path)
    logger.info('read all tweets and removed duplicates')

    if main_task == 'sentiment':
        if protect_att == 'race':
            logger.info('making sentiment-race')
            # sentiment label (happy/sad emoji sets) crossed with the 'aa'/'wh' author groups
            pos_pos = get_attr_sentiments(df, happy, sad, 'aa', MIN_SENTENCE_LEN)
            pos_neg = get_attr_sentiments(df, happy, sad, 'wh', MIN_SENTENCE_LEN)
            neg_pos = get_attr_sentiments(df, sad, happy, 'aa', MIN_SENTENCE_LEN)
            neg_neg = get_attr_sentiments(df, sad, happy, 'wh', MIN_SENTENCE_LEN)
        else:
            logger.error('not supporting this protected attribute...')
            exit(-1)
    elif main_task == 'mention':
        if protect_att == 'race':
            logger.info('making mention-race')
            # mention label crossed with the 'aa'/'wh' author groups
            wh, aa = get_race(df, MIN_SENTENCE_LEN)
            pos_pos, neg_pos = mention_split(aa, MIN_SENTENCE_LEN)
            pos_neg, neg_neg = mention_split(wh, MIN_SENTENCE_LEN)
        else:
            logger.error('not supporting this protected attribute...')
            exit(-1)
    else:
        logger.error('not supporting this task...')
        exit(-1)

    logger.info('done collecting data')

    # cap each of the four groups and build the vocabulary over the kept sentences
    size = 100000
    sentences = pos_pos[:size] + pos_neg[:size] + neg_pos[:size] + neg_neg[:size]
    vocab = list(set([item for sublist in sentences for item in sublist]))
    id2voc = dict(enumerate(vocab))
    voc2id = {v: k for k, v in id2voc.iteritems()}

    to_file(output_dir_path, voc2id, vocab,
            pos_pos[:size], pos_neg[:size], neg_pos[:size], neg_neg[:size])
    logger.info('written to file. exiting.')
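
# Illustrative only: a minimal sketch of how main() could be wired to a
# command-line entry point. The argparse wiring, argument names and defaults
# below are assumptions for illustration; the repository may use a different
# CLI layer (e.g. click) or different option names.
import argparse
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    parser = argparse.ArgumentParser(description='build processed datasets from raw tweets')
    parser.add_argument('input_file_path')
    parser.add_argument('output_dir_path')
    parser.add_argument('--main_task', choices=['sentiment', 'mention'], default='sentiment')
    parser.add_argument('--protect_att', choices=['race'], default='race')
    args = parser.parse_args()
    main(args.input_file_path, args.output_dir_path, args.main_task, args.protect_att)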
# female-authored tweets: train/test split (tweets 92000-94000 are held out),
# each side split by the mention label
train_pos_f, train_neg_f = mention_split(females[:92000], min_len=MIN_SENTENCE_LEN)
test_pos_f, test_neg_f = mention_split(females[94000:], min_len=MIN_SENTENCE_LEN)

train_pos_f = shuffle(train_pos_f, random_state=SEED)
train_neg_f = shuffle(train_neg_f, random_state=SEED)

# build the vocabulary over all splits, then write the gender datasets
train_size = 40000
sentences = (train_pos_m + train_pos_f + train_neg_m + train_neg_f +
             test_pos_m + test_pos_f + test_neg_m + test_neg_f)
vocab = list(set([item for sublist in sentences for item in sublist]))
id2voc = dict(enumerate(vocab))
voc2id = {v: k for k, v in id2voc.iteritems()}

to_file(project + 'data/processed/author_mention_gender/', voc2id, vocab,
        train_pos_m[:train_size] + test_pos_m,
        train_pos_f[:train_size] + test_pos_f,
        train_neg_m[:train_size] + test_neg_m,
        train_neg_f[:train_size] + test_neg_f)

# age buckets: labels 0-1 -> young, labels 2-4 -> old; sort tweets back by id
young, y_ids = tokenize(df[(df['age'] == 0) | (df['age'] == 1)], MIN_SENTENCE_LEN)
_, young = zip(*sorted(zip(y_ids, young)))
old, o_ids = tokenize(df[(df['age'] == 2) | (df['age'] == 3) | (df['age'] == 4)],
                      MIN_SENTENCE_LEN)
_, old = zip(*sorted(zip(o_ids, old)))

train_pos_y, train_neg_y = mention_split(young[6500:], min_len=MIN_SENTENCE_LEN)
test_pos_y, test_neg_y = mention_split(young[:6000], min_len=MIN_SENTENCE_LEN)
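
# Illustrative only: the mention_split helper used above is assumed to
# partition tokenized tweets into those that contain a mention token and
# those that do not, dropping tweets shorter than min_len. The sketch below
# (mention_split_sketch) is a hypothetical stand-in, not the repository's
# actual implementation.
def mention_split_sketch(tweets, min_len, mention_token):
    """Hypothetical stand-in for mention_split: split tokenized tweets by
    whether they contain mention_token, keeping only tweets of length >= min_len."""
    with_mention, without_mention = [], []
    for tokens in tweets:
        if len(tokens) < min_len:
            continue
        if mention_token in tokens:
            with_mention.append(tokens)
        else:
            without_mention.append(tokens)
    return with_mention, without_mention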
        try:
            t = normalize_text(aa.iloc[ind].text)
            if len(t) < min_len:
                continue
            # skip tweets that consist only of the mention token
            if len(set(t)) == 1 and t[0] == MENTION:
                continue
            # keep only tweets fully covered by the vocabulary
            if not all(x in vocab_d for x in t):
                continue
            try:
                # skip duplicates of previously seen sentences
                s = ' '.join([w for w in t])
                if s in prev_sent:
                    continue
            except:
                continue
            aa_data.append(t)
        except:
            pass
        if len(aa_data) >= 100000:
            break
    print 'reached 100k after {0} tweets'.format(ind)
    return wh_data, aa_data


wh, aa = get_race(cleaned, 3)

id2voc = dict(enumerate(vocab))
voc2id = {v: k for k, v in id2voc.iteritems()}

pos_pos, neg_pos = aa[:50000], aa[50000:]
pos_neg, neg_neg = wh[:50000], wh[50000:]

to_file(project + '/data/processed/unseen_race/', voc2id, vocab,
        pos_pos, pos_neg, neg_pos, neg_neg)
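
# Illustrative only: the vocab / id2voc / voc2id construction repeated above is
# a plain token-to-index mapping. The toy example below shows the kind of
# encoding it enables; encode_tokens and the sample sentences are hypothetical,
# not part of the repository.
def encode_tokens(tokens, voc2id):
    # map each known token to its integer id, silently dropping unknown tokens
    return [voc2id[w] for w in tokens if w in voc2id]

sample_sentences = [['good', 'morning'], ['so', 'sad', 'today']]
sample_vocab = list(set([item for sublist in sample_sentences for item in sublist]))
sample_id2voc = dict(enumerate(sample_vocab))
sample_voc2id = {v: k for k, v in sample_id2voc.items()}
print(encode_tokens(['so', 'good', 'today'], sample_voc2id))  # three ids; order depends on the unordered vocab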