Example no. 1
0
def main():
    args = get_args()
    logger = get_logger(args.write_log)
    
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
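    # After CUDA_VISIBLE_DEVICES is set, the selected GPUs are re-indexed from 0,
    # so the local device ids used below are simply 0..N-1.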
    device_ids_str = args.gpu.split(',')
    device_ids = list(range(len(device_ids_str)))

    multi_gpu = False
    if args.mode != "prep":
        logger.info("Loading network")
        model = AdaMatting(in_channel=4)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=0)
        if args.cuda:
            device = torch.device("cuda:{}".format(device_ids[0]))
            if len(device_ids) > 1 and args.mode == "train":
                logger.info("Loading with multiple GPUs")
                model = torch.nn.DataParallel(model, device_ids=device_ids)
                multi_gpu = True
            model = model.cuda(device=device_ids[0])
        else:
            device = torch.device("cpu")

    if args.mode == "train":
        logger.info("Program runs in train mode")
        train(model=model, optimizer=optimizer, device=device, args=args, logger=logger, multi_gpu=multi_gpu)
    elif args.mode == "test":
        logger.info("Program runs in test mode")
        test()
    elif args.mode == "prep":
        logger.info("Program runs in prep mode")
        # composite_dataset(args.raw_data_path, logger)
        gen_train_valid_names(args.valid_portion, logger)
    def _text2idx(df_texts, word2idx):
        args = get_args()
        unk = word2idx['<unk>']

        def words_to_ids(words):
            # Map each token to its id, falling back to <unk> for unknown words.
            return [word2idx.get(word, unk) for word in words]

        if args.use_paragraph:
            if args.word_seq:
                # Each paragraph is already a flat list of word tokens.
                return np.array([words_to_ids(paragraph) for paragraph in df_texts])
            # Each paragraph is a list of sentence strings; keep the first k.
            return np.array([
                [words_to_ids(sent.split()) for sent in paragraph[:args.first_k_paras]]
                for paragraph in df_texts
            ])
        else:
            if args.word_seq:
                return np.array([words_to_ids(paragraph) for paragraph in df_texts])
            # Each document is a list of sentence strings; keep the first k sentences.
            return np.array([
                [words_to_ids(sent.split()) for sent in doc[:args.first_k_sents]]
                for doc in df_texts
            ])
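
    # Illustrative (hypothetical) call: convert tokenized texts to id arrays once
    # word2idx has been built, e.g.
    #   X_train_ids = _text2idx(dataset.X_train, word2idx)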
    def assign_word_ids(emb_size, df_texts, special_tokens=["<pad>", "<unk>"]):
        """
        Given df_texts (list of sent tokens), create word2id and id2word
        based on the most common words
        :param  df_text: list of sent tokens
        :param special_tokens: set of special tokens to add into dictionary
        :param vocab_size: max_number of vocabs
        :return: word2id, id2word
        """
        args = get_args()
        id = 0
        word2id = {}
        # add special tokens in w2i
        for tok in special_tokens:
            word2id[tok] = id
            id += 1
            print(tok, word2id[tok])
        word_set = [
            word for doc in df_texts for sent in doc for word in sent.split()
        ]
        # elif p.model_type == 'word':
        #     word_set = [word for doc in df_texts for word in doc]
        c = Counter(word_set)

        # Shrink the vocabulary: drop rare words (frequency <= 5); with GloVe,
        # rare words that GloVe still covers are kept.
        train_words = list(c.keys())
        if args.use_glove:
            embsize_index = {200: '6B.', 300: '840B.'}
            glove_words = pickle.load(
                open(
                    p.glove_dir + embsize_index[emb_size] + str(emb_size) +
                    '_idx.pkl', 'rb'))
            # unks are words with frequency <= 5 that are also not in the GloVe vocabulary
            unks = [word for word in train_words if c[word] <= 5]
            unks = list(set(unks) - set(glove_words))
        else:
            unks = [word for word in train_words if c[word] <= 5]
        # print(unks)
        print("Number of unks: " + str(len(unks)))

        vocab = list(set(train_words) - set(unks))
        # add regular words in
        for word in vocab:
            word2id[word] = id
            id += 1
        id2word = {v: k for k, v in word2id.items()}
        # print('finishing processing %d vocabs' % len(word2id))
        return word2id, id2word
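
    # Illustrative (hypothetical) usage: build the vocabulary from the training
    # documents with 300-dimensional GloVe vectors, e.g.
    #   word2id, id2word = assign_word_ids(300, dataset.X_train)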
    def __init__(self):
        args = get_args()
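        # Choose the preprocessed split files by granularity: paragraph vs. whole
        # document, and word-level token sequences vs. sentence-split text.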

        if args.use_paragraph:
            if p.three_parts_split:
                data_train = pd.read_json(p.sent_split_dir +
                                          "train_3_parts_split.json")
                data_val = pd.read_json(p.sent_split_dir +
                                        "val_3_parts_split.json")
                data_test = pd.read_json(p.sent_split_dir +
                                         "test_3_parts_split.json")
            else:
                if args.word_seq:
                    data_train = pd.read_json(p.word_seq_dir +
                                              "train_paragraph_word.json")
                    data_val = pd.read_json(p.word_seq_dir +
                                            "val_paragraph_word.json")
                    data_test = pd.read_json(p.word_seq_dir +
                                             "test_paragraph_word.json")
                else:
                    data_train = pd.read_json(p.sent_split_dir +
                                              "train_paragraph_sentence.json")
                    data_val = pd.read_json(p.sent_split_dir +
                                            "val_paragraph_sentence.json")
                    data_test = pd.read_json(p.sent_split_dir +
                                             "test_paragraph_sentence.json")
        else:
            if args.word_seq:
                data_train = pd.read_json(p.word_seq_dir +
                                          "train_document_word.json")
                data_val = pd.read_json(p.word_seq_dir +
                                        "val_document_word.json")
                data_test = pd.read_json(p.word_seq_dir +
                                         "test_document_word.json")
            else:
                data_train = pd.read_json(p.sent_split_dir +
                                          "train_document_sentence.json")
                data_val = pd.read_json(p.sent_split_dir +
                                        "val_document_sentence.json")
                data_test = pd.read_json(p.sent_split_dir +
                                         "test_document_sentence.json")

        if p.small_data:
            data_train = data_train.head(50)
            data_val = data_val.head(50)
            data_test = data_test.head(50)

        self.X_train = data_train.tokens
        self.X_val = data_val.tokens
        self.X_test = data_test.tokens

        self.y_train = np.array(data_train.label.values)
        self.y_val = np.array(data_val.label.values)
        self.y_test = np.array(data_test.label.values)
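        # Remap labels so they start at 0: yelp {1, 2} -> {0, 1}; ag_news {1..4} -> {0..3}.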

        if args.dataset == "yelp":
            for labels in [self.y_train, self.y_val, self.y_test]:
                labels[labels == 1] = 0
                labels[labels == 2] = 1
        if args.dataset == "ag_news":
            for labels in [self.y_train, self.y_val, self.y_test]:
                for label in list(range(4)):
                    labels[labels == label + 1] = label
        data_train['label'] = data_train['label'].apply(
            lambda x: 0 if x == 1 else 1).astype('int64')
        data_val['label'] = data_val['label'].apply(
            lambda x: 0 if x == 1 else 1).astype('int64')
        data_test['label'] = data_test['label'].apply(
            lambda x: 0 if x == 1 else 1).astype('int64')

    if p.small_data:
        data_train = data_train.head(50)
        data_val = data_val.head(50)
        data_test = data_test.head(50)
    return data_train, data_val, data_test


if __name__ == '__main__':
    config = get_args()
    data_train, data_val, data_test = load_data()
    config.class_number = data_train['label'].nunique()
    config.levels = 3
    print(config)
    train_grouped = data_train.groupby("id")
    val_grouped = data_val.groupby("id")
    test_grouped = data_test.groupby("id")
    num_batches = len(train_grouped)

    if config.build_vocab:
        X_train = data_train.tokens
        X_val = data_val.tokens
        X_test = data_test.tokens
        dictionary = construct_dictionary(X_train, X_val, X_test)
    else:
        # Load a previously built dictionary from disk.
        with open(p.dict_path, 'rb') as f:
            dictionary = pickle.load(f)
Example no. 6
0
from utility import Dictionary, get_args
from dataset import TextDataset
import pickle
import parameters as p
import numpy as np
import bcolz
import pandas as pd

args = get_args()


def load_data_set():
    dataset = TextDataset()
    print("finished reading datasets")
    if not args.build_vocab:
        with open(p.dict_path, 'rb') as f:
            dictionary = pickle.load(f)
    else:
        dictionary = construct_dictionary(dataset.X_train, dataset.X_val,
                                          dataset.X_test)
    return dataset, dictionary.word2idx, dictionary.idx2word
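
# Illustrative (hypothetical) usage, assuming args and parameters are configured:
#   dataset, word2idx, idx2word = load_data_set()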


def construct_dictionary(data_train, data_val, data_test):
    if args.use_val:
        dataset = pd.concat([data_train, data_val], axis=0)
        if args.use_test:
            dataset = pd.concat([dataset, data_test], axis=0)
    else:
        dataset = data_train
    print("constructing doctionary...")