Example #1
def make_word_embedding(input_file: Path, word_embedding: str):
    # Load the pre-trained word embeddings
    wv = KeyedVectors.load_word2vec_format(word_embedding,
                                           binary=False,
                                           encoding='utf-8',
                                           unicode_errors='ignore')

    word_set = set()
    # Word-level segmentation
    with input_file.open('r') as f:
        for line in tqdm(f):
            json_line = json.loads(line)
            word_set = word_set.union(set(jieba.lcut(json_line['text'])))

    stoi = defaultdict(int)
    itos = defaultdict(str)
    vectors = []
    add_pad_unk(stoi, itos, vectors, wv)
    for idx, word in enumerate(word_set):
        if word in wv.vocab:
            stoi[word] = len(stoi)
            itos[len(itos)] = word
            vectors.append(wv.get_vector(word))
    word_embedding = WordEmbedding(stoi=stoi, itos=itos, vectors=vectors)

    # Character-level segmentation
    char_set = set()
    with input_file.open('r') as f:
        for line in tqdm(f):
            json_line = json.loads(line)
            char_set = char_set.union(set(list(json_line['text'])))

    stoi = defaultdict(int)
    itos = defaultdict(str)
    vectors = []
    add_pad_unk(stoi, itos, vectors, wv)
    for idx, char in enumerate(char_set):
        if char in wv.vocab:
            stoi[char] = len(stoi)
            itos[len(itos)] = char
            vectors.append(wv.get_vector(char))

    char_embedding = WordEmbedding(stoi=stoi, itos=itos, vectors=vectors)

    word_embedding_cache = Path(
        '../word_embedding/.cache/medical_word_embedding.pkl').open('wb')
    char_embedding_cache = Path(
        '../word_embedding/.cache/medical_char_embedding.pkl').open('wb')
    pickle.dump(word_embedding, word_embedding_cache)
    pickle.dump(char_embedding, char_embedding_cache)
    word_embedding_cache.close()
    char_embedding_cache.close()
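If the cached objects are needed later, they can be read back with pickle. A minimal sketch, assuming the cache paths written above, that the WordEmbedding class is importable at load time, and that it exposes stoi, itos and vectors as attributes (the class definition is not shown here):

import pickle
from pathlib import Path

import numpy as np

# Load the word-level cache written by make_word_embedding (path assumed from the example).
with Path('../word_embedding/.cache/medical_word_embedding.pkl').open('rb') as f:
    word_embedding = pickle.load(f)

# Stack the per-token vectors into one matrix; row order follows the stoi indices.
embedding_matrix = np.stack(word_embedding.vectors)

# Look up a token, falling back to index 0 (assumed to be the <pad>/<unk> slot from add_pad_unk).
token_index = word_embedding.stoi.get('医院', 0)
token_vector = embedding_matrix[token_index]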
Example #2
    def ocr_feature_generation_h5(self):
        self.node_feature_h5.create_dataset(
            "ocr_token_embeddings", (self.n_images, self.max_n_ocr, 300),
            dtype='float32')
        self.node_feature_h5.create_dataset("ocr_bounding_boxes",
                                            (self.n_images, self.max_n_ocr, 8),
                                            dtype='float32')

        word_embed = WordEmbedding('fasttext', self.word_emb_config)
        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Ocr feature generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_ocr = self.ocr[image_id]
            image_ocr_token_embeddings = np.zeros((self.max_n_ocr, 300),
                                                  dtype='float32')
            image_ocr_bounding_boxes = np.zeros((self.max_n_ocr, 8),
                                                dtype='float32')
            for ocr_index, (ocr_token, bbox) in enumerate(image_ocr.items()):
                image_ocr_token_embeddings[ocr_index] = word_embed(ocr_token)
                image_ocr_bounding_boxes[ocr_index] = np.array(bbox).flatten()
            self.node_feature_h5["ocr_token_embeddings"][image_index] = \
                image_ocr_token_embeddings
            self.node_feature_h5["ocr_bounding_boxes"][image_index] = \
                image_ocr_bounding_boxes
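The features written above can be read back with h5py. A minimal sketch, assuming a hypothetical node_features.h5 file that holds the two datasets created by this method:

import h5py

# Hypothetical file name; the method above only works with an already-open dataset handle.
with h5py.File('node_features.h5', 'r') as node_feature_h5:
    image_index = 0
    ocr_token_embeddings = node_feature_h5['ocr_token_embeddings'][image_index]  # shape (max_n_ocr, 300)
    ocr_bounding_boxes = node_feature_h5['ocr_bounding_boxes'][image_index]      # shape (max_n_ocr, 8)
    print(ocr_token_embeddings.shape, ocr_bounding_boxes.shape)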
Example #3
    def __init__(self, args):
        super().__init__()
        self.args = args
        self.bn = nn.BatchNorm1d(args.hidden_size)
        self.word_embed = WordEmbedding(args.vocab_size, 300, .0)
        self.sent_embed = nn.LSTM(args.embed_size,
                                  args.hidden_size,
                                  2,
                                  dropout=args.dropout,
                                  batch_first=True)
        self.sent_embed = DynamicRNN(self.sent_embed)
        self.hist_embed = nn.LSTM(args.embed_size,
                                  args.hidden_size,
                                  2,
                                  dropout=args.dropout,
                                  batch_first=True)
        self.hist_embed = DynamicRNN(self.hist_embed)
        self.bup_att = FIND(2048, 1024, 1024)

        self.q_net = FCNet([1024, 1024])
        self.v_net = FCNet([2048, 1024])
        self.linear = nn.Linear(args.hidden_size * 2, args.hidden_size)

        self.layer_stack = nn.ModuleList([
            REFER(d_model=512,
                  d_inner=1024,
                  n_head=4,
                  d_k=256,
                  d_v=256,
                  dropout=0.2) for _ in range(2)
        ])
Example #4
    def __init__(self, opt):
        self.opt = opt
        print('> training arguments:')
        for arg in vars(opt):
            print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))
        w2v_path = {
            'word': '/data/word2vec/sgns.financial.word',
            'word_bigram': '/data/word2vec/sgns.financial.bigram',
            'word_char': '/data/word2vec/sgns.financial.char',
            'word_char_bigram': '/data/word2vec/sgns.financial.bigram-char',
            'char': '/data/word2vec/sgns.char'
        }
        embed = WordEmbedding(
            os.path.dirname(__file__) + w2v_path[opt.vector_level])
        c2v_path = '/data/word2vec/sgns.char'
        charEmbed = WordEmbedding(os.path.dirname(__file__) + c2v_path)
        train_set = TextDataSet(os.path.dirname(__file__) +
                                '/data/test_data.csv',
                                embed,
                                charEmbed,
                                max_seq_len=opt.max_seq_len,
                                vector_level=opt.vector_level,
                                train=True)
        test_set = TextDataSet(os.path.dirname(__file__) +
                               '/data/test_data.csv',
                               embed,
                               charEmbed,
                               max_seq_len=opt.max_seq_len,
                               vector_level=opt.vector_level,
                               test=True)
        self.train_data_loader = DataLoader(dataset=train_set,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=test_set,
                                           batch_size=opt.batch_size,
                                           shuffle=True)
        self.writer = SummaryWriter(log_dir=opt.logdir)
        if opt.model_name in ['tmp', 'lstm_cnn']:
            self.model = opt.model_class(embed.m, charEmbed.m,
                                         opt).to(opt.device)
        else:
            self.model = opt.model_class(embed.m, opt).to(opt.device)
        self.reset_parameters()
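The trainer reads its configuration from an opt namespace. A minimal sketch of such a namespace, with hypothetical values for the fields accessed in __init__ above:

from argparse import Namespace

# Hypothetical argument set; only the fields read in __init__ above are listed.
opt = Namespace(vector_level='word',    # selects the entry in w2v_path
                max_seq_len=80,
                batch_size=64,
                logdir='runs/example',
                model_name='lstm_cnn',  # 'tmp' and 'lstm_cnn' models also receive charEmbed.m
                model_class=None,       # placeholder for the actual model class
                device='cpu')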
Example #5
    def object_name_embedding_generation(self):
        word_embed = WordEmbedding('glove', self.word_emb_config)
        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Object name embedding generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_nodes = self.nodes[image_id]
            n_objects = self.image_n_objects[image_index]
            image_object_name_embeddings = np.zeros((n_objects, 300),
                                                    dtype='float32')
            for object_index in range(n_objects):
                image_object_name_embeddings[object_index] = word_embed(
                    image_nodes[object_index])
            with open(
                    os.path.join(self.dir['object_name_embeddings'],
                                 '{}.p'.format(image_index)), 'wb') as f:
                pickle.dump(image_object_name_embeddings, f)
Example #6
    def object_name_embedding_generation_h5(self):
        self.node_feature_h5.create_dataset("object_name_embeddings",
                                            (self.n_images, 36, 300),
                                            dtype='float32')
        word_embed = WordEmbedding('glove', self.word_emb_config)
        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Object name embedding generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_nodes = self.nodes[image_id]
            n_objects = self.image_n_objects[image_index]
            image_object_name_embeddings = np.zeros((36, 300), dtype='float32')
            for object_index in range(n_objects):
                image_object_name_embeddings[object_index] = word_embed(
                    image_nodes[object_index])
            self.node_feature_h5['object_name_embeddings'][image_index] = \
                image_object_name_embeddings
Example #7
    def ocr_feature_generation(self):
        word_embed = WordEmbedding('fasttext', self.word_emb_config)

        for image_index in tqdm(range(self.n_images),
                                unit='image',
                                desc='Ocr feature generation'):
            image_id = self.image_ix_to_id[str(image_index)]
            image_ocr = self.ocr[image_id]
            n_ocr = len(image_ocr)
            image_ocr_token_embeddings = np.zeros((n_ocr, 300),
                                                  dtype='float32')
            image_ocr_bounding_boxes = np.zeros((n_ocr, 8), dtype='float32')
            for ocr_index, (ocr_token, bbox) in enumerate(image_ocr.items()):
                image_ocr_token_embeddings[ocr_index] = word_embed(ocr_token)
                image_ocr_bounding_boxes[ocr_index] = np.array(bbox).flatten()
            with open(
                    os.path.join(self.dir['ocr_token_embeddings'],
                                 '{}.p'.format(image_index)), 'wb') as f:
                pickle.dump(image_ocr_token_embeddings, f)
            with open(
                    os.path.join(self.dir['ocr_bounding_boxes'],
                                 '{}.p'.format(image_index)), 'wb') as f:
                pickle.dump(image_ocr_bounding_boxes, f)
Example #8
    def __init__(self, opt):
        self.opt = opt

        w2v_path = {
            'word': '/data/word2vec/sgns.financial.word',
            'word_bigram': '/data/word2vec/sgns.financial.bigram',
            'char': '/data/word2vec/sgns.financial.char',
            'char_bigram': '/data/word2vec/sgns.financial.bigram-char'
        }
        embed = WordEmbedding(os.path.dirname(__file__) +
                              w2v_path[opt.vector_level],
                              initializer='avg')
        data_set = TextDataSet(os.path.dirname(__file__) +
                               '/data/single_2500_temp.csv',
                               embed,
                               max_seq_len=opt.max_seq_len,
                               vector_level=opt.vector_level)
        self.data_loader = DataLoader(dataset=data_set, batch_size=512)

        self.model = opt.model_class(embed.m, opt).to(opt.device)
        self.model.load_state_dict(
            torch.load(
                os.path.dirname(__file__) + '/best/' + opt.model_name +
                '_best.pkl')['state_dict'])
Example #9
    def __init__(self, config):
        self.config = config
        self.word_embedding = WordEmbedding.WordEmbedding(config)
Example #10
    def __init__(self,
                 path_to_node_info='data/node_information.csv',
                 path_to_training_set='data/training_set.txt',
                 path_to_test_set='data/testing_set.txt',
                 path_to_wv_model=None,
                 load_graph_dict=True):
        """The ComputeFeatures class enables the user to compute multiple features.
        Consider using the static `import_from_file` static function so as not to have to compute the dataframe every single time.
        :param path_to_node_info: the `node_information.csv` file from the kaggle competition
        :param path_to_training_set: the `training_set`
        :param path_to_wv_model: the path to the wv model, if you do not want to train it from scratch."""

        self.handled_variables = [
            "publication_2", "adam_coeff", "overlapping_words_in_title",
            "number_of_common_authors", "difference_of_years",
            "affinity_between_authors", "identical_journal", "l2_distance",
            "cosine_distance_tfid", "l2_distance_between_titles",
            "common_neighbors", "clustering_coeff", "betweenness", "closeness",
            "degree", "eigenvector", "jaccard_coeff", "shortest_path",
            "pagerank", "community", "lp_within_inter_cluster",
            "lp_ra_index_soundarajan", "lp_cn_soundarajan",
            "lp_preferential_attachment", "lp_resource_allocation_index"
        ]

        print("Loading node information...")
        self.node_information = pd.read_csv(
            path_to_node_info,
            names=['id', 'year', 'title', 'author', 'journal', 'abstract'])
        self.node_information = self.node_information.set_index('id')

        print("Loading train array...")
        self.train_array = np.loadtxt(path_to_training_set, dtype=int)
        self.nb_training_samples = self.train_array.shape[0]

        print("Loading test array...")
        self.test_array = np.loadtxt(path_to_test_set, dtype=int)
        self.nb_testing_samples = self.test_array.shape[0]

        # for tokenization
        print("Loading stemmer and stop words...")
        nltk.download('punkt')
        self.stemmer = nltk.stem.PorterStemmer()
        nltk.download('stopwords')
        self.stpwds = set(nltk.corpus.stopwords.words("english"))

        print("TfidVectorizer...")
        training_words = list(self.node_information['abstract'])
        vectorizer = TfidfVectorizer(analyzer='word',
                                     ngram_range=(1, 3),
                                     min_df=0,
                                     stop_words="english")
        features_tfid = vectorizer.fit_transform(training_words)
        self.node_information["wv_tfid"] = pd.Series(
            [x for x in features_tfid])

        print("Creating word embeddings...")
        self.wv = WordEmbedding(self.stemmer, self.stpwds)
        if path_to_wv_model is None:
            print("Training wv model with standard params...")
            self.wv.train_model(self.node_information)
        else:
            print("Loading wv model from %s" % path_to_wv_model)
            self.wv.load_model(path_to_wv_model)

        # Create publication column
        print("Creating publication column...")
        self.node_information['publication'] = self.node_information.apply(
            lambda row: [], axis=1)
        for t in self.train_array:
            if t[2] == 1:
                self.node_information.loc[t[0], 'publication'].append(t[1])
                self.node_information.loc[t[1], 'publication'].append(t[0])

        # Create publication_II column
        print("Creating publication II column...")
        self.node_information['publication_2'] = self.node_information.apply(
            lambda row: list_of_publication_2(row, self.node_information),
            axis=1)

        # Authors dict
        print("Creating authors dictionary...")
        authors_list = []
        self.node_information['author'].apply(
            lambda row: stack_lists(row, authors_list))
        authors_list = [
            auth for auth in authors_list
            if auth not in ['', '&', "(", ")"] and len(auth) > 2
        ]
        self.authors_dict = dict(
            (auth, []) for auth in np.unique(authors_list))
        del authors_list
        self.node_information['author'].apply(
            lambda row: put_authors_in_dict(row, self.authors_dict))
        for k in self.authors_dict.keys():
            while k in self.authors_dict[k]:
                self.authors_dict[k].remove(k)
            while '' in self.authors_dict[k]:
                self.authors_dict[k].remove('')

        # Feature vector for the abstract
        print("Making feature vectors for the abstract...")
        self.node_information['wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['abstract'], self.wv.model
                                            ),
            axis=1)

        print("Making feature vectors for the title...")
        self.node_information['title_wv'] = self.node_information.apply(
            lambda row: make_feature_vector(row.loc['title'], self.wv.model),
            axis=1)

        # Graph
        print("Making graph structure...")
        self.graph_structure = GraphStructure(self, load_graph_dict)

        self.abstract_feature_model = None
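Instantiation follows directly from the constructor signature. A minimal sketch, using the default data layout implied by the parameter defaults and training the word-vector model from scratch:

# Hypothetical usage; the paths mirror the constructor defaults shown above.
compute_features = ComputeFeatures(path_to_node_info='data/node_information.csv',
                                   path_to_training_set='data/training_set.txt',
                                   path_to_test_set='data/testing_set.txt',
                                   path_to_wv_model=None,   # None trains the wv model from scratch
                                   load_graph_dict=True)
print(compute_features.handled_variables)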