import getopt
import gzip
import os
import sys

import cPickle
import numpy as np
from nltk.tokenize import TweetTokenizer
from tqdm import tqdm

# `Alphabet`, `load_glove_vocabulary` and `preprocess_tweet` are
# project-local helpers; minimal sketches of what they plausibly look
# like are given further down in this file.


def main(argv):
    outdir = "preprocessed_data"
    out_file = ''
    out_reduced = ''
    in_file = ''
    max_tweets = np.inf
    fwemb_vocabulary = None

    try:
        # 'wfilter' and 'maxTweets' take arguments, so they need a
        # trailing '=' in the long-option list.
        opts, args = getopt.getopt(
            argv, "i:o:f:m:",
            ["ifile=", "ofile=", "wfilter=", "maxTweets="])
    except getopt.GetoptError:
        print ('usage: {} -i <inputfile> -o <outputfile> '
               '[-f <wfilter>] [-m <maxTweets>]').format(sys.argv[0])
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-o", "--ofile"):
            out_file = '{}.pickle'.format(arg)
            out_reduced = '{}_reduced.pickle'.format(arg)
        elif opt in ("-i", "--ifile"):
            in_file = 'semeval/{}.gz'.format(arg)
        elif opt in ('-f', '--wfilter'):
            # Restrict the alphabet to words covered by the given
            # pretrained GloVe embeddings.
            fwemb_vocabulary = load_glove_vocabulary(
                'embeddings/{}'.format(arg), ' ')
        elif opt in ('-m', '--maxTweets'):
            max_tweets = int(arg)

    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Unsupervised data: build a word -> index alphabet from raw tweets.
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    dummy_word_idx = alphabet.fid  # kept from the original; unused below

    tknzr = TweetTokenizer(reduce_len=True)
    fnames_gz = [in_file]

    counter = 0
    for fname in fnames_gz:
        with gzip.open(fname, 'r') as f:
            for tweet in tqdm(f):
                tokens = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tokens:
                    # With a filter vocabulary, keep only words that have
                    # a pretrained embedding; otherwise keep everything.
                    if fwemb_vocabulary:
                        if token in fwemb_vocabulary:
                            alphabet.add(token)
                    else:
                        alphabet.add(token)
                counter += 1
                if (counter % 1000000) == 0:
                    print 'Processed tweets: {}'.format(counter)
                    print 'Alphabet length: {}'.format(len(alphabet))
                if counter > max_tweets:
                    break

    print 'Alphabet before purge:', len(alphabet)
    with open(os.path.join(outdir, out_file), 'wb') as fout:
        cPickle.dump(alphabet, fout)

    # Purge rare words into a *new* alphabet that keeps only tokens seen
    # more than 10 times. (The original re-added frequent words to the
    # same alphabet, which never shrinks it.)
    reduced_alphabet = Alphabet(start_feature_id=0)
    reduced_alphabet.add('UNKNOWN_WORD_IDX')
    for word, (idx, freq) in tqdm(alphabet.items()):
        if freq > 10:
            reduced_alphabet.add(word)
    reduced_alphabet.add('DUMMY_WORD_IDX')  # stray quote in the original removed

    print 'Alphabet after purge:', len(reduced_alphabet)
    with open(os.path.join(outdir, out_reduced), 'wb') as fout:
        cPickle.dump(reduced_alphabet, fout)
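
# --- Hedged sketches of the project-local helpers used above ---
# `Alphabet` is not defined in this file. Judging from the calls in
# main() (`add`, `fid`, `items()` yielding word -> (index, frequency),
# `len()`), a minimal compatible implementation could look like the
# sketch below; the repo's real class may well differ.
class Alphabet(dict):
    """Word -> (index, frequency) mapping (illustrative sketch only)."""

    def __init__(self, start_feature_id=0):
        super(Alphabet, self).__init__()
        self.fid = start_feature_id  # next feature id to hand out

    def add(self, item):
        # Assign a fresh index on first sight; bump the count otherwise.
        if item in self:
            idx, freq = self[item]
            self[item] = (idx, freq + 1)
        else:
            idx = self.fid
            self[item] = (idx, 1)
            self.fid += 1
        return idx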
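
# `load_glove_vocabulary` is called as load_glove_vocabulary(path, ' ')
# above, so it presumably returns the set of words in the first column
# of a GloVe text file. A plausible sketch, not the repo's actual code:
def load_glove_vocabulary(fname, sep):
    """Return the set of words covered by a GloVe embedding file,
    assuming one 'word <vector>' entry per line, fields split on `sep`."""
    vocabulary = set()
    with open(fname) as f:
        for line in f:
            vocabulary.add(line.split(sep, 1)[0])
    return vocabulary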
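
# `preprocess_tweet` is also project-local. Tweet preprocessors in
# SemEval-era pipelines typically lowercase and mask URLs and user
# mentions; the sketch below is one such guess, not the repo's actual
# normalization.
import re

def preprocess_tweet(tweet):
    """Light tweet normalization (illustrative sketch only)."""
    tweet = tweet.lower()
    tweet = re.sub(r'https?://\S+', '<url>', tweet)  # mask URLs
    tweet = re.sub(r'@\w+', '<user>', tweet)         # mask @mentions
    return tweet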
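
if __name__ == '__main__':
    # Standard entry point so the script can be run directly, e.g.:
    #   python preprocess.py -i tweets -o alphabet -f glove.twitter.27B.50d.txt
    # (the file name `preprocess.py` is just a placeholder here).
    main(sys.argv[1:])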