Example #1
def main(input_file_path, output_dir_path, main_task, protect_att):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    df = get_data(input_file_path)

    logger.info('read all tweets and removed duplicates')

    if main_task == 'sentiment':
        if protect_att == 'race':
            logger.info('making sentiment-race')
            pos_pos = get_attr_sentiments(df, happy, sad, 'aa',
                                          MIN_SENTENCE_LEN)
            pos_neg = get_attr_sentiments(df, happy, sad, 'wh',
                                          MIN_SENTENCE_LEN)
            neg_pos = get_attr_sentiments(df, sad, happy, 'aa',
                                          MIN_SENTENCE_LEN)
            neg_neg = get_attr_sentiments(df, sad, happy, 'wh',
                                          MIN_SENTENCE_LEN)
        else:
            logger.error('protected attribute not supported for this task')
            exit(-1)
    elif main_task == 'mention':
        if protect_att == 'race':
            logger.info('making mention-race')
            wh, aa = get_race(df, MIN_SENTENCE_LEN)
            pos_pos, neg_pos = mention_split(aa, MIN_SENTENCE_LEN)
            pos_neg, neg_neg = mention_split(wh, MIN_SENTENCE_LEN)
        else:
            logger.error('protected attribute not supported for this task')
            exit(-1)
    else:
        logger.error('main task not supported')
        exit(-1)

    logger.info('done collecting data')

    size = 100000
    sentences = (pos_pos[:size] + pos_neg[:size] +
                 neg_pos[:size] + neg_neg[:size])
    # Build the vocabulary and the token <-> id lookup tables.
    vocab = list(set(item for sublist in sentences for item in sublist))
    id2voc = dict(enumerate(vocab))
    voc2id = {v: k for k, v in id2voc.items()}

    to_file(output_dir_path, voc2id, vocab, pos_pos[:size], pos_neg[:size],
            neg_pos[:size], neg_neg[:size])
    logger.info('written to file. exiting.')
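
The excerpts never show how main() is invoked. A minimal, hypothetical entry point (the argument names, defaults, and logging format below are assumptions, not part of the original project) could wire it up with argparse like this:

import argparse
import logging

if __name__ == '__main__':
    # Basic console logging so the logger.info calls inside main() are visible.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = argparse.ArgumentParser(
        description='Build processed data sets from raw tweets.')
    parser.add_argument('input_file_path', help='path to the raw data file')
    parser.add_argument('output_dir_path', help='directory for the processed output')
    parser.add_argument('--main-task', default='sentiment',
                        choices=['sentiment', 'mention'])
    parser.add_argument('--protect-att', default='race',
                        help='protected attribute to split on')
    args = parser.parse_args()

    main(args.input_file_path, args.output_dir_path,
         args.main_task, args.protect_att)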
Example #2
train_pos_f, train_neg_f = mention_split(females[:92000],
                                         min_len=MIN_SENTENCE_LEN)
test_pos_f, test_neg_f = mention_split(females[94000:],
                                       min_len=MIN_SENTENCE_LEN)
train_pos_f = shuffle(train_pos_f, random_state=SEED)
train_neg_f = shuffle(train_neg_f, random_state=SEED)

train_size = 40000
sentences = (train_pos_m + train_pos_f + train_neg_m + train_neg_f +
             test_pos_m + test_pos_f + test_neg_m + test_neg_f)
vocab = list(set(item for sublist in sentences for item in sublist))
id2voc = dict(enumerate(vocab))
voc2id = {v: k for k, v in id2voc.items()}

to_file(project + 'data/processed/author_mention_gender/', voc2id, vocab,
        train_pos_m[:train_size] + test_pos_m,
        train_pos_f[:train_size] + test_pos_f,
        train_neg_m[:train_size] + test_neg_m,
        train_neg_f[:train_size] + test_neg_f)

young, y_ids = tokenize(df[(df['age'] == 0) | (df['age'] == 1)],
                        MIN_SENTENCE_LEN)
_, young = zip(*sorted(zip(y_ids, young)))

old, o_ids = tokenize(
    df[(df['age'] == 2) | (df['age'] == 3) | (df['age'] == 4)],
    MIN_SENTENCE_LEN)
_, old = zip(*sorted(zip(o_ids, old)))

train_pos_y, train_neg_y = mention_split(young[6500:],
                                         min_len=MIN_SENTENCE_LEN)
test_pos_y, test_neg_y = mention_split(young[:6000], min_len=MIN_SENTENCE_LEN)
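
The shuffle used above is presumably scikit-learn's sklearn.utils.shuffle, given the random_state keyword; with a fixed seed it returns the same permuted copy on every run, which is what makes the training splits reproducible. A tiny self-contained illustration with made-up data:

from sklearn.utils import shuffle

SEED = 0  # placeholder value; the excerpts only reference the name SEED

first = shuffle([['a'], ['b'], ['c'], ['d']], random_state=SEED)
second = shuffle([['a'], ['b'], ['c'], ['d']], random_state=SEED)
assert first == second  # the same seed always yields the same permutation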
Example #3
            t = normalize_text(aa.iloc[ind].text)
            if len(t) < min_len:
                continue
            # Skip tweets that consist of nothing but mention tokens.
            if len(set(t)) == 1 and t[0] == MENTION:
                continue
            # Skip tweets containing out-of-vocabulary tokens.
            if not all(x in vocab_d for x in t):
                continue

            try:
                # Skip sentences that were already collected.
                s = ' '.join(t)
                if s in prev_sent:
                    continue
            except:
                continue

            aa_data.append(t)
        except:
            pass
        if len(aa_data) >= 100000:
            break
    print('reached 100k after {0} tweets'.format(ind))
    return wh_data, aa_data


wh, aa = get_race(cleaned, 3)

id2voc = dict(enumerate(vocab))
voc2id = {v: k for k, v in id2voc.items()}

pos_pos, neg_pos = aa[:50000], aa[50000:]
pos_neg, neg_neg = wh[:50000], wh[50000:]

to_file(project + '/data/processed/unseen_race/', voc2id, vocab, pos_pos,
        pos_neg, neg_pos, neg_neg)
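
All three examples end by building id2voc/voc2id and handing them to to_file, whose body is not shown in these excerpts. The two dictionaries are the usual token <-> integer-id tables; a hypothetical pair of helpers (not part of the original code) shows how such mappings are typically consumed when turning tokenized tweets into id sequences:

def encode(sentence, voc2id):
    """Map a tokenized sentence to integer ids (hypothetical helper)."""
    return [voc2id[token] for token in sentence]


def decode(ids, id2voc):
    """Map integer ids back to tokens (hypothetical helper)."""
    return [id2voc[i] for i in ids]


# Round trip over a tiny made-up vocabulary.
vocab = ['i', 'love', 'twitter']
id2voc = dict(enumerate(vocab))
voc2id = {v: k for k, v in id2voc.items()}
assert decode(encode(['love', 'twitter'], voc2id), id2voc) == ['love', 'twitter']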