Exemple #1
0
def charify(line):
    """Break the normalized text of ``line`` into a list and cache it.

    Reads ``line.txt``. When it is ``None`` the object is left untouched and
    ``None`` is returned. Otherwise the text is run through
    ``normalize_tweet``, the result is expanded with ``list()`` (one element
    per character, assuming ``normalize_tweet`` returns a string), stored on
    ``line.chars`` and returned.
    """
    raw = line.txt
    # Guard clause: nothing to split, and ``line.chars`` must not be set.
    if raw is None:
        return None
    characters = line.chars = list(normalize_tweet(raw))
    return characters
Exemple #2
0
def charify(line):
    """Return the normalized text of ``line`` as a list, or ``None``.

    If ``line.txt`` is ``None`` the function returns ``None`` and does not
    touch ``line``. Otherwise ``normalize_tweet(line.txt)`` is converted with
    ``list()`` (one element per character, assuming ``normalize_tweet``
    returns a string); that list is assigned to ``line.chars`` and returned.
    """
    source_text = line.txt
    if source_text is None:
        # No text available: leave ``line.chars`` unset.
        return None
    pieces = list(normalize_tweet(source_text))
    line.chars = pieces
    return pieces
Exemple #3
0
def preprocess_semeval(infile, outfile, vocab=None, label_dict=None):
    """Convert a SemEval tweet TSV into lines of label id + character ids.

    Tweets are read with ``load_from_tsv(infile, subtask_id='a')`` and those
    whose label is ``'neutral'`` are dropped. Each remaining tweet's
    ``raw_text`` is passed through ``normalize_tweet`` and every character of
    the result is replaced by its integer id from ``vocab``. One line per
    tweet is written to ``outfile``: the label id, a tab, then the
    space-separated character ids.

    Args:
        infile: Path handed to ``load_from_tsv``.
        outfile: Path of the output file; also the prefix of the
            ``.vocab.pkl`` and ``.labels.pkl`` side files.
        vocab: Optional existing character -> id mapping. When ``None`` a
            new ``defaultdict(int)`` is built from this data (ids assigned in
            order of first appearance, starting at 0) and pickled to
            ``<outfile>.vocab.pkl``.
        label_dict: Optional existing label -> id mapping. When ``None`` one
            is built from the labels seen here and pickled to
            ``<outfile>.labels.pkl``.

    Returns:
        None. All results are written to disk.

    Raises:
        KeyError: If a supplied ``label_dict`` lacks a label that occurs in
            the data, or a supplied plain-``dict`` ``vocab`` lacks a
            character that occurs in the data.
    """
    # Build real lists rather than filter()/map() results: the tweets are
    # measured with len() and traversed more than once, which lazy iterators
    # do not support.
    tweets = [t for t in load_from_tsv(infile, subtask_id='a')
              if t.label != 'neutral']
    print('loaded %d tweets from %s' % (len(tweets), infile))

    mk_label_map = label_dict is None
    if mk_label_map:
        label_set = list(set(t.label for t in tweets))
        # enumerate() yields the same position-based ids as
        # label_set.index() without rescanning the list for every label.
        label_dict = {label: idx for idx, label in enumerate(label_set)}
    label_ints = [label_dict[t.label] for t in tweets]

    mk_vocab = vocab is None
    if mk_vocab:
        vocab = defaultdict(int)

    ntexts = []
    for tweet in tweets:
        ntext = normalize_tweet(tweet.raw_text)
        ntexts.append(ntext)
        if mk_vocab:
            for c in ntext:
                if c not in vocab:
                    # The vocab starts empty here, so its current size is
                    # the next unused id.
                    vocab[c] = len(vocab)
    print(vocab)
    # Blank separator line; an explicit empty string prints the same thing
    # whether ``print`` is a statement or a function.
    print('')

    lines = []
    for label_int, ntext in zip(label_ints, ntexts):
        # NOTE: with a defaultdict(int) vocab an unseen character silently
        # maps to 0 (and is inserted); with a plain dict it raises KeyError.
        ints_str = ' '.join(str(vocab[c]) for c in ntext)
        lines.append(str(label_int) + "\t" + ints_str)

    if mk_vocab:
        vocab_filename = '%s.vocab.pkl' % outfile
        print('writing vocab to %s' % vocab_filename)
        # Binary mode + context manager: pickle data is bytes, and the
        # handle must be closed so the data is flushed to disk.
        with open(vocab_filename, 'wb') as vocab_file:
            cPickle.dump(vocab, vocab_file)
    if mk_label_map:
        label_filename = '%s.labels.pkl' % outfile
        print('writing label map to %s' % label_filename)
        with open(label_filename, 'wb') as label_file:
            cPickle.dump(label_dict, label_file)

    print('writing output to %s' % outfile)
    with open(outfile, 'w') as f:
        f.writelines(line + '\n' for line in lines)
Exemple #4
0
def preprocess_semeval(infile, outfile, vocab=None, label_dict=None):
    """Encode non-neutral SemEval tweets as label-id / character-id lines.

    Loads tweets via ``load_from_tsv(infile, subtask_id='a')``, discards the
    ones labelled ``'neutral'``, normalizes each ``raw_text`` with
    ``normalize_tweet`` and maps every character of the result to an integer
    through ``vocab``. ``outfile`` receives one line per tweet: the label id,
    a tab, then the character ids separated by single spaces.

    Args:
        infile: Path handed to ``load_from_tsv``.
        outfile: Destination path; also used as the prefix for the
            ``.vocab.pkl`` and ``.labels.pkl`` pickle files.
        vocab: Optional character -> id mapping to reuse. If ``None``, a
            ``defaultdict(int)`` is filled from this data (first-seen order,
            ids starting at 0) and pickled to ``<outfile>.vocab.pkl``.
        label_dict: Optional label -> id mapping to reuse. If ``None``, one
            is derived from the labels present and pickled to
            ``<outfile>.labels.pkl``.

    Returns:
        None. Output goes to the files described above.

    Raises:
        KeyError: If a supplied ``label_dict`` is missing a label found in
            the data, or a supplied plain-``dict`` ``vocab`` is missing a
            character found in the data.
    """
    # Use concrete lists instead of filter()/map(): len() and repeated
    # traversal below require a sequence, not a one-shot iterator.
    tweets = [t for t in load_from_tsv(infile, subtask_id='a')
              if t.label != 'neutral']
    print('loaded %d tweets from %s' % (len(tweets), infile))

    mk_label_map = label_dict is None
    if mk_label_map:
        label_set = list(set(t.label for t in tweets))
        # Position in label_set is the id; enumerate() avoids the repeated
        # linear scans of label_set.index().
        label_dict = {label: idx for idx, label in enumerate(label_set)}
    label_ints = [label_dict[t.label] for t in tweets]

    mk_vocab = vocab is None
    if mk_vocab:
        vocab = defaultdict(int)

    ntexts = []
    for tweet in tweets:
        ntext = normalize_tweet(tweet.raw_text)
        ntexts.append(ntext)
        if mk_vocab:
            for c in ntext:
                if c not in vocab:
                    # A freshly built vocab has ids 0..len-1, so its size is
                    # the next free id.
                    vocab[c] = len(vocab)
    print(vocab)
    # Emit the blank separator line explicitly so it appears regardless of
    # whether ``print`` is a statement or a function.
    print('')

    lines = []
    for label_int, ntext in zip(label_ints, ntexts):
        # NOTE: a defaultdict(int) vocab maps unseen characters to 0 (and
        # inserts them); a plain dict raises KeyError instead.
        ints_str = ' '.join(str(vocab[c]) for c in ntext)
        lines.append(str(label_int) + "\t" + ints_str)

    if mk_vocab:
        vocab_filename = '%s.vocab.pkl' % outfile
        print('writing vocab to %s' % vocab_filename)
        # Pickles are byte streams: open in binary mode and close the
        # handle deterministically.
        with open(vocab_filename, 'wb') as vocab_file:
            cPickle.dump(vocab, vocab_file)
    if mk_label_map:
        label_filename = '%s.labels.pkl' % outfile
        print('writing label map to %s' % label_filename)
        with open(label_filename, 'wb') as label_file:
            cPickle.dump(label_dict, label_file)

    print('writing output to %s' % outfile)
    with open(outfile, 'w') as f:
        f.writelines(line + '\n' for line in lines)