Example #1
def is_stimm():
    # Check that the reduced GloVe file contains one vector per vocabulary entry.
    from preprocess import read_vocab
    from preprocess import vocab
    glove_reduced = "data/glove_100d.txt"
    vocab_file = "data/vocab"
    read_vocab(vocab_file)  # expected to populate the module-level vocab

    # Count the word vectors (one per line) in the reduced GloVe file.
    counter = 0
    with open(glove_reduced, "r") as f:
        for _ in f:
            counter += 1

    if len(vocab) == counter:
        print("YESSSSSS")

    print(len(vocab), " <==> ", counter)
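As written, the check only prints its result. A minimal variant that returns the comparison as a boolean instead, under the same assumption that read_vocab populates the module-level vocab, might look like this:

def vocab_matches_glove(glove_path="data/glove_100d.txt", vocab_file="data/vocab"):
    # Hypothetical variant of is_stimm(): True when the vocabulary size
    # equals the number of vectors in the reduced GloVe file.
    from preprocess import read_vocab, vocab
    read_vocab(vocab_file)  # assumed to populate the module-level vocab
    with open(glove_path, "r") as f:
        n_vectors = sum(1 for _ in f)
    return len(vocab) == n_vectors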
Example #2
def get_dataset_gender_cooccurrences(data_dir,
                                     vocab,
                                     target_pos=None,
                                     female_nouns=None,
                                     male_nouns=None):
    """
    Get gender cooccurrences for a preprocessed dataset
    """
    if isinstance(vocab, str):
        # Load vocab if a path was provided
        vocab = read_vocab(vocab)

    female_cooccur = Counter()
    male_cooccur = Counter()

    for fname in os.listdir(data_dir):
        fpath = os.path.join(data_dir, fname)
        # Read file
        sentences = read_preprocessed_file(fpath, vocab)
        female_cooccur_file, male_cooccur_file \
            = get_sentence_list_gender_cooccurrences(sentences, target_pos=target_pos)

        female_cooccur += female_cooccur_file
        male_cooccur += male_cooccur_file

    return female_cooccur, male_cooccur
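A usage sketch for this function; the directory and vocabulary paths below are placeholders, not taken from the original project:

# Placeholder paths; vocab may also be passed as an already-loaded object.
female_counts, male_counts = get_dataset_gender_cooccurrences(
    "data/preprocessed", "data/vocab")
print("Top female co-occurrences:", female_counts.most_common(10))
print("Top male co-occurrences:", male_counts.most_common(10))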
Example #3
def get_dataset_gender_cooccurrences(data_dir,
                                     vocab,
                                     target_pos=None,
                                     female_nouns=None,
                                     male_nouns=None,
                                     n_jobs=1,
                                     verbose=False):
    """
    Get gender cooccurrences for a preprocessed dataset
    """
    if isinstance(vocab, str):
        # Load vocab if a path was provided
        vocab = read_vocab(vocab)

    female_cooccur = Counter()
    male_cooccur = Counter()

    filepaths = [
        os.path.join(data_dir, fname) for fname in os.listdir(data_dir)
    ]
    worker_args = ((read_preprocessed_file(fpath, vocab), target_pos,
                    female_nouns, male_nouns) for fpath in filepaths)

    num_files = len(filepaths)
    LOGGER.info("Processing {} files.".format(num_files))

    update_interval = max(num_files // 1000, 1)
    pool = Pool(n_jobs)

    last_time = time.time()

    for idx, res in enumerate(pool.imap_unordered(file_worker, worker_args)):
        female_cooccur_sent, male_cooccur_sent = res
        female_cooccur += female_cooccur_sent
        male_cooccur += male_cooccur_sent

        if verbose and ((idx + 1) % update_interval == 0):
            curr_time = time.time()
            LOGGER.info("| Processed {}/{} files. Batch took {}s.".format(
                idx + 1, num_files, curr_time - last_time))
            last_time = curr_time

    pool.close()
    pool.join()

    return female_cooccur, male_cooccur
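The file_worker passed to pool.imap_unordered is not shown in this excerpt. A plausible sketch, assuming it merely unpacks the tuple built in worker_args and forwards it to get_sentence_list_gender_cooccurrences (the keyword names are guesses), would be:

def file_worker(args):
    # Assumed worker: unpack (sentences, target_pos, female_nouns, male_nouns)
    # and delegate to the per-file co-occurrence counter.
    sentences, target_pos, female_nouns, male_nouns = args
    return get_sentence_list_gender_cooccurrences(
        sentences,
        target_pos=target_pos,
        female_nouns=female_nouns,
        male_nouns=male_nouns)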
Example #4
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch,
                    len(train_data) // args.bptt, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


# Load the vocabulary
vocab = preprocess.read_vocab(args.vocab)

# Build an index of the .bin filenames for the subsequent split
# inds = jams.util.find_with_extension(args.data, 'bin')
inds = [
    os.path.join(args.data, x) for x in os.listdir(args.data)
    if x.endswith('bin')
]

index_train = {'id': {}}
for iteration, ind in enumerate(inds):
    index_train['id'][iteration] = os.path.basename(ind)
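This snippet builds the index but does not show the write-out; Example #5 later reads files such as idx_train.json with pandas. A minimal sketch of that step (the output filename is assumed from Example #5):

import json

# Assumed write-out so later scripts can load the index with pd.read_json.
with open('idx_train.json', 'w') as f:
    json.dump(index_train, f)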
Example #5
    else:
        torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

vocab = preprocess.read_vocab(os.path.join(args.data, 'VOCAB.txt'))
idx_train = pd.read_json('idx_train.json')
idx_val = pd.read_json('idx_val.json')
idx_test = pd.read_json('idx_test.json')

# Load pretrained Embeddings
gn_glove_dir = './gn_glove/1b-vectors300-0.8-0.8.txt'  # 142527 tokens, last one is '<unk>'
with open(gn_glove_dir) as f:
    ntokens = sum(1 for _ in f) + 1

with open(gn_glove_dir) as f:
    gn_glove_vecs = np.zeros((ntokens, 300))
    words2idx_emb = {}
    idx2words_emb = []
    # ordered_words = []
    for i, line in enumerate(f):
        try:
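The loop body is truncated right after the try:. When reading GloVe-style text vectors, the body typically splits each line into a word and its 300 float components; a generic sketch of that pattern (not the original body) is:

import numpy as np

def parse_glove_line(line, dim=300):
    # Generic GloVe-line parser: "<word> v1 v2 ... v300".
    parts = line.rstrip().split(' ')
    word, values = parts[0], parts[1:]
    if len(values) != dim:
        raise ValueError("unexpected vector length for %r" % word)
    return word, np.asarray(values, dtype=np.float32)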
Example #6
# Keep only sentence pairs whose source and target are at most max_len = 25 tokens
# data_test = data_test[data_test['target'].str.split(" ").str.len() <= max_len]
# data_test = data_test[data_test['source'].str.split(" ").str.len() <= max_len]
data_test = data_test[
    (data_test['source'].str.split(" ").str.len() <= max_len)
    & (data_test['target'].str.split(" ").str.len() <= max_len)
]
data_test = data_test.reset_index(drop=True)

# Concatenating train and test data
entire_data = pd.concat([data, data_test], axis=0)

# Tokenize the sentences with the language-specific tokenizers
src_sents = pp.preprocess_func(entire_data['source'], tokenizers[source_lang], min_word_count)
trgt_sents = pp.preprocess_func(entire_data['target'], tokenizers[target_lang], min_word_count)

# Generate source and target vocabularies from the tokenized sentences
src_vocab = pp.read_vocab(src_sents)
trgt_vocab = pp.read_vocab(trgt_sents)

# Split the data into train and test indices
training_indices = np.arange(0,19431)
test_indices = np.arange(19431,19947)

# Select the preprocessed training and test sentences using the indices
training_source = [src_sents[i] for i in training_indices]
test_source = [src_sents[i] for i in test_indices]

training_target = [trgt_sents[i] for i in training_indices]
test_target = [trgt_sents[i] for i in test_indices]

# Pad the sequences with zeros up to the maximum sequence length
max_seq_length = max_len + 2  # 2 for EOS_token and SOS_token
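The padding step announced by the last comment is not included in the excerpt. A generic sketch of zero-padding token-id sequences to max_seq_length (the helper is hypothetical, and it assumes the sentences are lists of token ids):

def pad_sequences(sequences, max_seq_length, pad_value=0):
    # Hypothetical helper: right-pad each token-id list with zeros and
    # truncate anything longer than max_seq_length.
    padded = []
    for seq in sequences:
        seq = list(seq)[:max_seq_length]
        padded.append(seq + [pad_value] * (max_seq_length - len(seq)))
    return padded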
Example #7
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)
    # train()

    if sys.argv[1] == 'train':
        train()
    else:
        test()
def vector_to_dict():
    # read_vocab() and vector_read() are expected to populate the global
    # vocab_array and vector_array, which are zipped into a word -> vector map.
    read_vocab()
    vector_read()
    dictionary = dict(zip(vocab_array, vector_array))
    return dictionary
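A short usage sketch for the resulting mapping (the lookup key is only an illustration):

word_vectors = vector_to_dict()
vec = word_vectors.get("example")  # None if the word is not in the vocabulary
if vec is not None:
    print(len(vec))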