Code example #1
File: gqa_loader.py  Project: yuzcccc/openvqa
    def tokenize(self, stat_ques_dict, use_glove):
        token_to_ix = {
            'PAD': 0,
            'UNK': 1,
            'CLS': 2,
        }

        spacy_tool = None
        pretrained_emb = []
        if use_glove:
            spacy_tool = en_vectors_web_lg.load()
            pretrained_emb.append(spacy_tool('PAD').vector)
            pretrained_emb.append(spacy_tool('UNK').vector)
            pretrained_emb.append(spacy_tool('CLS').vector)

        max_token = 0
        for qid in stat_ques_dict:
            ques = stat_ques_dict[qid]['question']
            words = re.sub(r"([.,'!?\"()*#:;])", '',
                           ques.lower()).replace('-',
                                                 ' ').replace('/',
                                                              ' ').split()

            if len(words) > max_token:
                max_token = len(words)

            for word in words:
                if word not in token_to_ix:
                    token_to_ix[word] = len(token_to_ix)
                    if use_glove:
                        pretrained_emb.append(spacy_tool(word).vector)

        pretrained_emb = np.array(pretrained_emb)

        return token_to_ix, pretrained_emb, max_token
Code example #2
File: data_utils.py  Project: J-BING/mcan-vqa
def tokenize(stat_ques_list, use_glove):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('PAD').vector)
        pretrained_emb.append(spacy_tool('UNK').vector)

    for ques in stat_ques_list:
        words = re.sub(r"([.,'!?\"()*#:;])", '',
                       ques['question'].lower()).replace('-', ' ').replace(
                           '/', ' ').split()

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb
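The token_to_ix mapping returned above is typically used afterwards to turn each question into a fixed-length sequence of word indices, with unseen words falling back to 'UNK' and zero padding read as 'PAD'. Below is a minimal sketch of such a helper, assuming the max_token padding length returned by the variant in example #1; the name proc_ques is illustrative and not taken from the snippet above.

import re
import numpy as np

def proc_ques(ques, token_to_ix, max_token):
    # Zero-filled output: index 0 is 'PAD' in the vocabularies built above.
    ques_ix = np.zeros(max_token, np.int64)
    # Same cleaning as tokenize(): strip punctuation, split on '-', '/' and whitespace.
    words = re.sub(r"([.,'!?\"()*#:;])", '',
                   ques['question'].lower()).replace('-', ' ').replace('/', ' ').split()
    for ix, word in enumerate(words):
        # Unknown words fall back to the 'UNK' index.
        ques_ix[ix] = token_to_ix.get(word, token_to_ix['UNK'])
        if ix + 1 == max_token:
            break
    return ques_ix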
Code example #3
    def tokenize(self, stat_caps_list, use_glove):
        max_token = 0
        token_to_ix = {
            'PAD': 0,
            'UNK': 1,
            'CLS': 2,
        }

        spacy_tool = None
        pretrained_emb = []
        if use_glove:
            spacy_tool = en_vectors_web_lg.load()
            pretrained_emb.append(spacy_tool('PAD').vector)
            pretrained_emb.append(spacy_tool('UNK').vector)
            pretrained_emb.append(spacy_tool('CLS').vector)

        for cap in stat_caps_list:
            words = re.sub(r"([.,'!?\"()*#:;])", '',
                           cap.lower()).replace('-', ' ').replace('/',
                                                                  ' ').split()
            max_token = max(len(words), max_token)
            for word in words:
                if word not in token_to_ix:
                    token_to_ix[word] = len(token_to_ix)
                    if use_glove:
                        pretrained_emb.append(spacy_tool(word).vector)

        pretrained_emb = np.array(pretrained_emb)

        return token_to_ix, pretrained_emb, max_token
Code example #4
File: tokenize.py  Project: wangxl1998/MOSEI_UMONS
def create_dict(key_to_sentence, dataroot, use_glove=True):
    token_file = dataroot + "/token_to_ix.pkl"
    glove_file = dataroot + "/train_glove.npy"
    if os.path.exists(glove_file) and os.path.exists(token_file):
        print("Loading train language files")
        return pickle.load(open(token_file, "rb")), np.load(glove_file)

    print("Creating train language files")
    token_to_ix = {
        'UNK': 1,
    }

    spacy_tool = None
    pretrained_emb = []
    if use_glove:
        spacy_tool = en_vectors_web_lg.load()
        pretrained_emb.append(spacy_tool('UNK').vector)

    for k, v in key_to_sentence.items():
        for word in v:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                if use_glove:
                    pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)
    np.save(glove_file, pretrained_emb)
    pickle.dump(token_to_ix, open(token_file, "wb"))
    return token_to_ix, pretrained_emb
Code example #5
File: utils.py  Project: baophuc27/answer-generation
def get_pretrained_emd_OOV(all_tokens):
    glove = en_vectors_web_lg.load()
    pretrained_emb = []
    if isinstance(all_tokens, list):
        for token in all_tokens:
            pretrained_emb.append(glove(token).vector)
    return pretrained_emb
Code example #6
def calculate_similarity(src_files, bug_reports):

    # Loading word vectors
    nlp = en_vectors_web_lg.load()

    src_docs = [
        nlp(' '.join(src.file_name['unstemmed'] +
                     src.class_names['unstemmed'] +
                     src.attributes['unstemmed'] + src.comments['unstemmed'] +
                     src.method_names['unstemmed']))
        for src in src_files.values()
    ]

    min_max_scaler = MinMaxScaler()

    all_simis = []
    for report in bug_reports.values():
        report_doc = nlp(' '.join(report.summary['unstemmed'] +
                                  report.pos_tagged_description['unstemmed']))
        scores = []
        for src_doc in src_docs:
            simi = report_doc.similarity(src_doc)
            scores.append(simi)

        scores = np.array([float(count) for count in scores]).reshape(-1, 1)
        normalized_scores = np.concatenate(
            min_max_scaler.fit_transform(scores))

        all_simis.append(normalized_scores.tolist())

    return all_simis
Code example #7
File: utils.py  Project: baophuc27/answer-generation
def get_pretrained_emb(all_tokens):
    glove = en_vectors_web_lg.load()
    pretrained_emb = []

    for token_ix in all_tokens:
        pretrained_emb.append(glove(all_tokens[token_ix]).vector)

    return pretrained_emb
Code example #8
    def tokenize(self, json_file, use_glove):
        token_to_ix, max_token = json.load(open(json_file, 'r'))[2:]
        spacy_tool = None
        if use_glove:
            spacy_tool = en_vectors_web_lg.load()

        pretrained_emb = []
        for word in token_to_ix:
            if use_glove:
                pretrained_emb.append(spacy_tool(word).vector)
        pretrained_emb = np.array(pretrained_emb)

        return token_to_ix, pretrained_emb, max_token
Code example #9
    def tokenize(self, stat_ques_list, use_glove):
        t1 = time.time()
        token_to_ix = {
            'PAD': 0,
            'UNK': 1,
            'CLS': 2,
        }

        spacy_tool = None
        pretrained_emb = []
        if use_glove:
            spacy_tool = en_vectors_web_lg.load()
            pretrained_emb.append(spacy_tool('PAD').vector)
            pretrained_emb.append(spacy_tool('UNK').vector)
            pretrained_emb.append(spacy_tool('CLS').vector)

        t2 = time.time()
        print("first part: %f" % (t2 - t1))

        for ques in stat_ques_list:
            words = re.sub(r"([.,'!?\"()*#:;])", '',
                           ques['question'].lower()).replace('-', ' ').replace(
                               '/', ' ').split()

            for word in words:
                if word not in token_to_ix:
                    token_to_ix[word] = len(token_to_ix)
                    if use_glove:
                        pretrained_emb.append(spacy_tool(word).vector)

        pretrained_emb = np.array(pretrained_emb)

        t3 = time.time()
        print("second part: %f" % (t3 - t2))

        return token_to_ix, pretrained_emb
Code example #10
    def __init__(self,
                 opt,
                 folder='./result',
                 batchsize=64,
                 max_length=15,
                 mode='train'):
        self.opt = opt
        self.batchsize = batchsize
        self.d_vocabulary = None
        self.batch_index = None
        self.batch_len = None
        self.rev_adict = None
        self.max_length = max_length
        self.mode = mode
        self.qdic, self.adic = VQADataProvider.load_data(mode)

        with open('./%s/vdict.json' % folder, 'r') as f:
            self.vdict = json.load(f)
        with open('./%s/adict.json' % folder, 'r') as f:
            self.adict = json.load(f)

        self.n_ans_vocabulary = len(self.adict)
        self.nlp = en_vectors_web_lg.load()
        self.glove_dict = {}  # word -> glove vector
Code example #11
File: load_data_vgd.py  Project: zhwzhong/mmnas
    def tokenize(self, stat_refs_list, use_glove):
        token_to_ix = {
            'PAD': 0,
            'UNK': 1,
            'CLS': 2,
        }
        spacy_tool = None
        pretrained_emb = []
        if use_glove:
            spacy_tool = en_vectors_web_lg.load()
            pretrained_emb.append(spacy_tool('PAD').vector)
            pretrained_emb.append(spacy_tool('UNK').vector)
            pretrained_emb.append(spacy_tool('CLS').vector)

        for ref_ in stat_refs_list:
            words = ref_['tokens']
            for word in words:
                if word not in token_to_ix:
                    token_to_ix[word] = len(token_to_ix)
                    if use_glove:
                        pretrained_emb.append(spacy_tool(word).vector)
        pretrained_emb = np.array(pretrained_emb)

        return token_to_ix, pretrained_emb
Code example #12
def tokenize(qns_list):
    token_to_ix = {
        'PAD': 0,
        'UNK': 1,
    }

    pretrained_emb = []
    spacy_tool = en_vectors_web_lg.load()
    pretrained_emb.append(spacy_tool('PAD').vector)
    pretrained_emb.append(spacy_tool('UNK').vector)

    for qn in qns_list:
        words = re.sub(r"([.,'!?\"()*#:;])", '',
                       qn['question'].lower()).replace('-', ' ').replace(
                           '/', ' ').split()

        for word in words:
            if word not in token_to_ix:
                token_to_ix[word] = len(token_to_ix)
                pretrained_emb.append(spacy_tool(word).vector)

    pretrained_emb = np.array(pretrained_emb)

    return token_to_ix, pretrained_emb
Code example #13
File: vqa_loader.py  Project: itsShnik/VQA-MIB
    def __init__(self, __C):
        super(DataSet, self).__init__()
        self.__C = __C

        # --------------------------
        # ---- Raw data loading ----
        # --------------------------

        print("Loading all questions (for statistics)")
        # Loading question word list
        stat_ques_list = \
                json.load(open(__C.RAW_PATH[__C.DATASET]['train'], 'r'))['questions'] + \
                json.load(open(__C.RAW_PATH[__C.DATASET]['val'], 'r'))['questions'] + \
                json.load(open(__C.RAW_PATH[__C.DATASET]['test'], 'r'))['questions'] + \
                json.load(open(__C.RAW_PATH[__C.DATASET]['vg'], 'r'))['questions']

        '''
        stat_ques_list = [
            {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000},
            {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}
        ]
        '''

        # Loading answer word list
        stat_ans_list = \
                json.load(open(__C.RAW_PATH[__C.DATASET]['train-anno'], 'r'))['annotations'] + \
                json.load(open(__C.RAW_PATH[__C.DATASET]['val-anno'], 'r'))['annotations']

        if self.__C.USE_NEW_QUESTION == "False":

            print("Loading all image features")
            # Loading all image paths
            frcn_feat_path_list = \
                glob.glob(__C.FEATS_PATH[__C.DATASET]['train'] + '/*.npz') + \
                glob.glob(__C.FEATS_PATH[__C.DATASET]['val'] + '/*.npz') + \
                glob.glob(__C.FEATS_PATH[__C.DATASET]['test'] + '/*.npz')

            # Loading question and answer list
            self.ques_list = []
            self.ans_list = []

            print("Loading split questions and answers")
            split_list = __C.SPLIT[__C.RUN_MODE].split('+')
            for split in split_list:
                self.ques_list += json.load(open(__C.RAW_PATH[__C.DATASET][split], 'r'))['questions']
                #if __C.RUN_MODE in ['train']:
                self.ans_list += json.load(open(__C.RAW_PATH[__C.DATASET][split + '-anno'], 'r'))['annotations']

            # Define run data size
            if __C.RUN_MODE in ['train']:
                self.data_size = self.ans_list.__len__()
            else:
                self.data_size = self.ques_list.__len__()

            # assert len(self.ques_list) == len(self.ans_list), "Size of question list and answer list does not match"
            print(' ========== Dataset size:', self.data_size)

        if self.__C.USE_NEW_QUESTION == "True" and self.__C.RUN_MODE == "test":

            print("Loading image features of image_id: {}".format(self.__C.IMAGE_ID))
            # Loading all image paths
            frcn_feat_path_list = glob.glob(__C.FEATS_PATH[__C.DATASET]['test'] + '/COCO_test2015_' + str(self.__C.IMAGE_ID).zfill(12) + '.jpg.npz')

            print("Loading the specified question")
            # Loading question word list
            temp_json = {}
            temp_json['image_id'] = self.__C.IMAGE_ID 
            temp_json['question'] = self.__C.NEW_QUESTION
            temp_json['question_id'] = 100000001

            self.ques_list = [temp_json]
            self.data_size = self.ques_list.__len__()
            print(' ========== Dataset size:', self.data_size)


        # ------------------------
        # ---- Data statistic ----
        # ------------------------

        # {image id} -> {absolute image feature path}
        self.iid_to_frcn_feat_path = self.img_feat_path_load(frcn_feat_path_list)

        # {question id} -> {question}
        self.qid_to_ques = self.ques_load(self.ques_list)

        '''
        To print 2 items from each dictionary
        from itertools import islice
        print( list(islice(self.qid_to_ques.items(), 2)))
        print( list(islice(self.iid_to_frcn_feat_path.items(), 2)))

        qid_to_ques = {
            '458752000': {'image_id': 458752, 'question': 'What is this photo taken looking through?', 'question_id': 458752000},
            '458752001': {'image_id': 458752, 'question': 'What position is this man playing?', 'question_id': 458752001}
        }
        iid_to_frcn_feat_path = {
            '187465': './data/vqa/feats/train2014/COCO_train2014_000000187465.jpg.npz', 
            '78909': './data/vqa/feats/train2014/COCO_train2014_000000078909.jpg.npz'
        }
        '''

        # Tokenize
        # Tokenization needs a spaCy tool, so it is loaded here
        self.spacy_tool = en_vectors_web_lg.load()
        print('')
        # No of different words in all questions combined
        print("Tokenising questions")
        self.token_to_ix, self.pretrained_emb = self.tokenize(stat_ques_list, __C.USE_GLOVE)
        self.token_size = self.token_to_ix.__len__()
        print(' ========== Question token vocab size:', self.token_size)

        # Answers statistic
        # Tokenize and make a vocabulary of each word in the answer as separate tokens

        # Edits
        # Added the initialization of these two only when mode is train
        # if __C.RUN_MODE in ['train']:
        print("Tokenising answers")
        # No of different words in all answers combined
        self.token_to_ix_ans , self.pretrained_emb_ans = self.tokenize_ans(stat_ans_list, __C.USE_GLOVE)
        self.token_size_ans = self.token_to_ix_ans.__len__()
        print(" ========== Answer token vocab size: ", self.token_size_ans)
        '''
        token_to_ix_ans = {
            'PAD': 0, 
            'UNK': 1,
            'CLS': 2, 
            'net': 3,
            'pitcher': 4,
            'orange': 5,
            'yes': 6,
            'white': 7,
            'skiing': 8,
            'red': 9,
            'frisbee': 10,
            .
            .
            .
        }
        # To print first 50 items
        from itertools import islice
        print( list(islice(self.token_to_ix_ans.items(), 50)))
        sys.exit(0)
        '''

        # End of our edit

        ans_freq = 8
        self.ans_to_ix, self.ix_to_ans = self.ans_stat_from_file('openvqa/datasets/vqa/answer_dict.json')
        # self.ans_to_ix, self.ix_to_ans = self.ans_stat(stat_ans_list, ans_freq=ans_freq)
        self.ans_size = self.ans_to_ix.__len__()
        print(' ========== Answer token vocab size (occur more than {} times):'.format(ans_freq), self.ans_size)
        print('Finished!')
        print('')
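The pretrained_emb matrix assembled by these tokenize calls is normally copied into the word-embedding layer of the downstream model. A minimal sketch, assuming PyTorch and the 300-dimensional vectors that en_vectors_web_lg provides; the stand-in array and variable names are illustrative only.

import numpy as np
import torch
import torch.nn as nn

# Stand-in for the (token_size, 300) matrix built by tokenize().
pretrained_emb = np.zeros((100, 300), dtype=np.float32)

# Initialize an embedding layer from the GloVe matrix.
embedding = nn.Embedding(num_embeddings=pretrained_emb.shape[0],
                         embedding_dim=pretrained_emb.shape[1])
embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))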
Code example #14
File: sav.py  Project: martibook/SemanPhone
"""
Calculate semantic association value.

The value lies in [0, 1].
"""

# import spacy
# nlp = spacy.load('en_vectors_web_lg')

import en_vectors_web_lg
nlp = en_vectors_web_lg.load()


def SAV(word1, word2):
    """calculate semantic association value of two words

    @word1 -- the first word
    @word2 -- the second word
    @return -- a float giving the semantic association value
    """
    # nlp.vocab[...] returns a Lexeme; Lexeme.similarity compares the GloVe vectors.
    lex1 = nlp.vocab[word1]
    lex2 = nlp.vocab[word2]
    return lex1.similarity(lex2)


def main(word1, word2):
    """for testing functions in this module
    """
    print('word1: ', word1)
    print('word2: ', word2)
    print('semantic association value: {v: .2%}'.format(v=SAV(word1, word2)))
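A short usage sketch for the module above; the word pair is illustrative and the exact percentage depends on the loaded vectors.

if __name__ == '__main__':
    main('coffee', 'tea')  # prints both words and their semantic association value as a percentage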
Code example #15
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 15
MAX_LEN = 40
LEARN_RATE = 0.001
BY_SENTENCE = True
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'

if os.name == 'nt':
    path = 'C:\\Users\\EGimenez\\Programming\\PyCharmProjects\\Kaggle\\data\\Tweets\\'
else:
    path = '/content/'

spell = SpellChecker()
print("Loading spaCy")
nlp_ents = en_core_web_sm.load()
nlp_vects = en_vectors_web_lg.load()
nlp_vects.add_pipe(nlp_vects.create_pipe("sentencizer"))

train_df = pd.read_csv(os.path.join(path, 'train.csv'))
test_df = pd.read_csv(os.path.join(path, 'test.csv'))


def manage_spelling(tweets):
    for i, tweet in enumerate(tweets):

        if i % 25 == 0:
            print(i)

        line = str(tweet)
        for word in tweet:
            if not word.has_vector: