Beispiel #1
0
 def extract_information(self, train_instances):
     """Collect both word sequences of every instance and store their IDF table.

     Each instance is queried through ``get_word(type=self.type, lower=True)``,
     which must yield exactly two sequences. All sequences are handed to
     ``utils.idf_calculator`` and the result is kept on ``self.idf_weight``.
     """
     corpus = []
     for instance in train_instances:
         first, second = instance.get_word(type=self.type, lower=True)
         corpus += [first, second]
     self.idf_weight = utils.idf_calculator(corpus)
 def extract_information(self, train_instances):
     """Build or load the unigram IDF dictionary and store it on the instance.

     When ``self.is_training`` is true, the two warrants, the reason and the
     claim of every instance (from ``get_six(type='word')``) are passed to
     ``utils.idf_calculator``. The result is written to
     ``config.RESOURCE_DIR + '/idf_dict.txt'`` as one ``key<TAB>value`` line
     per entry, and its size is printed. Otherwise the dictionary is read
     back from that same file.

     Args:
         train_instances: iterable of instances exposing ``get_six``; only
             consumed in training mode.

     Side effects:
         Sets ``self.unigram_dict``; in training mode also writes the
         dictionary file and prints the number of entries.
     """
     dict_path = config.RESOURCE_DIR + '/idf_dict.txt'
     if self.is_training:
         sents = []
         for train_instance in train_instances:
             # title and info are returned by get_six but are not used here.
             warrant0, warrant1, reason, claim, _title, _info = \
                 train_instance.get_six(type='word')
             sents.extend([warrant0, warrant1, reason, claim])
         idf_dict = utils.idf_calculator(sents)
         with utils.create_write_file(dict_path) as fw:
             for key in idf_dict:
                 print('{}\t{}'.format(key, idf_dict[key]), file=fw)
         print(len(idf_dict))
     else:
         idf_dict = {}
         with utils.create_read_file(dict_path) as fr:
             for line in fr:
                 # Remove only the line terminator: a bare strip() would also
                 # swallow a whitespace-only key together with the tab
                 # separator, making the entry unparseable.
                 entry = line.rstrip('\r\n')
                 if not entry:
                     continue  # tolerate blank lines
                 # Split on the last tab so the value is always the final field.
                 key, value = entry.rsplit('\t', 1)
                 idf_dict[key] = float(value)
     self.unigram_dict = idf_dict
 def extract_information(self, train_instances):
     """Store the IDF table computed over the dependency sequences.

     ``get_dependency()`` must return exactly two sequences per instance;
     both are fed, in order, to ``utils.idf_calculator`` and the result is
     assigned to ``self.idf_weight``.
     """
     def dependency_sequences():
         # Yield the two dependency sequences of each instance in turn.
         for instance in train_instances:
             left, right = instance.get_dependency()
             yield left
             yield right

     self.idf_weight = utils.idf_calculator(list(dependency_sequences()))
Beispiel #4
0
 def extract_information(self, train_instances):
     """Store the IDF table computed over the ``type='pos'`` sequences.

     Every instance is queried with ``get_word(type='pos', stopwords=True)``
     and must return exactly two sequences. The table is therefore built
     from these 'pos' sequences rather than from any other word type.
     """
     collected = []
     for instance in train_instances:
         tags_a, tags_b = instance.get_word(type='pos', stopwords=True)
         collected.extend((tags_a, tags_b))
     # The IDF weights here come from the 'pos' sequences collected above.
     self.idf_weight = utils.idf_calculator(collected)
Beispiel #5
0
    def extract_information(self, train_instances):
        """Store the IDF table computed over the lemma sequences.

        Each instance is queried with ``get_word(type='lemma',
        stopwords=self.stopwords, lower=True)`` and must return exactly two
        sequences. The result of ``utils.idf_calculator`` over all of them
        is kept on ``self.idf_weight``.
        """
        def lemma_sequences():
            # Yield the two lemma sequences of each instance in turn.
            for instance in train_instances:
                first, second = instance.get_word(
                    type='lemma', stopwords=self.stopwords, lower=True)
                yield first
                yield second

        self.idf_weight = utils.idf_calculator(list(lemma_sequences()))
Beispiel #6
0
    def extract_information(self, train_instances):
        """Prepare the IDF table, the word index and the embeddings.

        One sentence per instance is taken from ``get_sent(self.type)``.
        The IDF table from ``utils.idf_calculator`` is stored on
        ``self.idf_weight``; its keys are numbered in iteration order to form
        ``self.word2index``, which is then passed to
        ``utils.load_word_embedding`` to fill ``self.embeddings``.
        """
        sentences = [
            instance.get_sent(self.type) for instance in train_instances
        ]

        self.idf_weight = utils.idf_calculator(sentences)

        index_of = {}
        for position, token in enumerate(self.idf_weight.keys()):
            index_of[token] = position
        self.word2index = index_of

        self.embeddings = utils.load_word_embedding(
            self.word2index, self.emb_file, self.dim, self.binary)
 def extract_information(self, train_instances):
     """Store the IDF table and vocabulary for reduced dependency entries.

     Each entry of the two dependency sequences returned by
     ``get_dependency()`` is reduced to the pair of its elements at index 1
     and index 2. The reduced sequences go to ``utils.idf_calculator``
     (stored on ``self.idf_weight``), and ``utils.word2index`` of that table
     is stored on ``self.vocab``.
     """
     corpus = []
     for instance in train_instances:
         raw_a, raw_b = instance.get_dependency()
         for raw in (raw_a, raw_b):
             # Keep only positions 1 and 2 of every dependency entry.
             corpus.append([(item[1], item[2]) for item in raw])
     self.idf_weight = utils.idf_calculator(corpus)
     self.vocab = utils.word2index(self.idf_weight)
 def extract_information(self, train_instances):
     """Prepare the IDF table, the vocabulary and the embeddings.

     Word sequences come from ``get_word`` configured by ``self.word_type``,
     ``self.stopwords`` and ``self.lower``; each instance must return exactly
     two sequences. The IDF table is stored on ``self.idf_weight`` and
     ``utils.word2index`` of it on ``self.vocab``. ``self.vocab`` is then
     replaced, together with ``self.embeddings``, by the pair returned from
     ``utils.load_word_embedding``.
     """
     corpus = []
     for instance in train_instances:
         pair = instance.get_word(type=self.word_type,
                                  stopwords=self.stopwords,
                                  lower=self.lower)
         left, right = pair
         corpus.extend((left, right))

     self.idf_weight = utils.idf_calculator(corpus)
     self.vocab = utils.word2index(self.idf_weight)

     loaded = utils.load_word_embedding(self.vocab, self.emb_file)
     self.vocab, self.embeddings = loaded
Beispiel #9
0
    def extract_information(self, train_instances):
        """Prepare the IDF table, vocabulary and embeddings for 'n'-tagged words.

        From the two ``(word, tag)`` sequences returned by
        ``get_pos_tag(stopwords=False)`` only the words whose tag equals
        ``'n'`` are kept (presumably nouns; confirm against the tagger).
        The IDF table is stored on ``self.idf_weight``. ``self.vocab`` and
        ``self.embeddings`` receive the pair returned by
        ``utils.load_word_embedding`` for the ``utils.word2index`` vocabulary.
        """
        def tagged_n(tagged_tokens):
            # Keep the words whose tag is exactly 'n'.
            return [word for word, tag in tagged_tokens if tag == 'n']

        corpus = []
        for instance in train_instances:
            tagged_a, tagged_b = instance.get_pos_tag(stopwords=False)
            corpus += [tagged_n(tagged_a), tagged_n(tagged_b)]

        idf_table = utils.idf_calculator(corpus)
        initial_vocab = utils.word2index(idf_table)
        self.idf_weight = idf_table
        loaded = utils.load_word_embedding(initial_vocab, self.emb_file)
        self.vocab, self.embeddings = loaded
Beispiel #10
0
    def extract_information(self, train_instances):
        """Prepare the IDF table, word index and embeddings for 'n'-tagged words.

        From the two ``(word, tag)`` sequences returned by
        ``get_pos_tag(stopwords=False)`` only the words whose tag equals
        ``'n'`` are kept (presumably nouns; confirm against the tagger).
        The IDF table is stored on ``self.idf_weight``; its keys, numbered in
        iteration order, form ``self.word2index``, which is passed to
        ``utils.load_word_embedding`` to fill ``self.embeddings``.
        """
        corpus = []
        for instance in train_instances:
            tagged_a, tagged_b = instance.get_pos_tag(stopwords=False)
            for tagged in (tagged_a, tagged_b):
                corpus.append([word for word, tag in tagged if tag == 'n'])

        self.idf_weight = utils.idf_calculator(corpus)
        self.word2index = dict(
            (token, position)
            for position, token in enumerate(self.idf_weight.keys()))
        self.embeddings = utils.load_word_embedding(
            self.word2index, self.emb_file, self.dim, self.binary)
Beispiel #11
0
    def extract_information(self, train_instances):
        """Prepare the IDF table, the word index and the embeddings.

        Word sequences come from ``get_word`` configured by
        ``self.word_type``, ``self.stopwords`` and ``self.lower``; each
        instance must return exactly two sequences. The IDF table is stored on
        ``self.idf_weight``; its keys, numbered in iteration order, form
        ``self.word2index``, which is passed to ``utils.load_word_embedding``
        to fill ``self.embeddings``.
        """
        def word_sequences():
            # Yield the two word sequences of each instance in turn.
            for instance in train_instances:
                first, second = instance.get_word(type=self.word_type,
                                                  stopwords=self.stopwords,
                                                  lower=self.lower)
                yield first
                yield second

        self.idf_weight = utils.idf_calculator(list(word_sequences()))

        index_of = {}
        for position, token in enumerate(self.idf_weight.keys()):
            index_of[token] = position
        self.word2index = index_of

        self.embeddings = utils.load_word_embedding(
            self.word2index, self.emb_file, self.dim, self.binary)
Beispiel #12
0
 def extract_information(self, train_instances):
     """Build or load the per-type IDF dictionary and derive a word index.

     When ``self.is_training`` is true, one sentence per instance is taken
     from ``get_sent(self.type)`` and passed to ``utils.idf_calculator``.
     The result is written to
     ``config.DICTIONARY_DIR + '/{type}_idf_dict.txt'`` as ``key<TAB>value``
     lines ordered by descending value. Otherwise the dictionary is read back
     from that same file.

     Args:
         train_instances: iterable of instances exposing ``get_sent``; only
             consumed in training mode.

     Side effects:
         Sets ``self.unigram_dict`` and ``self.word2index``. The latter
         numbers the keys in reverse-sorted order. In training mode the
         dictionary file is also written.
     """
     dict_path = config.DICTIONARY_DIR + '/{}_idf_dict.txt'.format(self.type)
     if self.is_training:
         sents = [
             train_instance.get_sent(self.type)
             for train_instance in train_instances
         ]
         idf_dict = utils.idf_calculator(sents)
         with utils.create_write_file(dict_path) as fw:
             idf_dict_tuple = sorted(
                 idf_dict.items(), key=lambda x: x[1], reverse=True)
             for key, value in idf_dict_tuple:
                 print('{}\t{}'.format(key, value), file=fw)
     else:
         idf_dict = {}
         with utils.create_read_file(dict_path) as fr:
             for line in fr:
                 # Remove only the line terminator: a bare strip() would also
                 # swallow a whitespace-only key together with the tab
                 # separator, making the entry unparseable.
                 entry = line.rstrip('\r\n')
                 if not entry:
                     continue  # tolerate blank lines
                 # Split on the last tab so the value is always the final field.
                 key, value = entry.rsplit('\t', 1)
                 idf_dict[key] = float(value)
     self.unigram_dict = idf_dict
     # Sorting the keys makes the index independent of dict iteration order.
     word_keys = sorted(idf_dict.keys(), reverse=True)
     self.word2index = {word: i for i, word in enumerate(word_keys)}
    def extract(self, train_instance):
        """Concatenate sequence, match and vectorized features for one instance.

        The two preprocessed sequences from ``get_preprocess()`` are passed
        to ``utils.sentence_sequence_features``,
        ``utils.sentence_match_features`` and
        ``utils.sentence_vectorize_features``. The last one is called with
        ``convey='count'`` and with a table computed by
        ``utils.idf_calculator`` over just these two sequences. The per-helper
        info values are discarded.

        Args:
            train_instance: object exposing ``get_preprocess()`` that returns
                exactly two sequences.

        Returns:
            A ``(features, infos)`` pair: the concatenated features of the
            three helpers, and ``[sa, sb]``.
        """
        sa, sb = train_instance.get_preprocess()

        features = []

        sequence_feats, _ = utils.sentence_sequence_features(sa, sb)
        features += sequence_feats

        match_feats, _ = utils.sentence_match_features(sa, sb)
        features += match_feats

        # The table is built from this pair only, not from a whole corpus.
        bow = utils.idf_calculator([sa, sb])
        vector_feats, _ = utils.sentence_vectorize_features(
            sa, sb, bow, convey='count')
        features += vector_feats

        infos = [sa, sb]
        return features, infos
Beispiel #14
0
    def extract_information(self, train_instances):
        """Build or load the unigram IDF dictionary and store it on the instance.

        When ``self.is_training`` is true, ``get_word()`` of every instance
        is passed to ``utils.idf_calculator``. The result is written to
        ``config.DICTIONARY_DIR + '/idf_dict.txt'`` as one ``key<TAB>value``
        line per entry, and its size is printed. Otherwise the dictionary is
        read back from that same file.

        Args:
            train_instances: iterable of instances exposing ``get_word``;
                only consumed in training mode.

        Side effects:
            Sets ``self.unigram_dict``; in training mode also writes the
            dictionary file and prints the number of entries.
        """
        dict_path = config.DICTIONARY_DIR + '/idf_dict.txt'
        if self.is_training:
            sents = [
                train_instance.get_word()
                for train_instance in train_instances
            ]
            idf_dict = utils.idf_calculator(sents)

            with utils.create_write_file(dict_path) as fw:
                for key in idf_dict:
                    print('{}\t{}'.format(key, idf_dict[key]), file=fw)

            print(len(idf_dict))
        else:
            idf_dict = {}
            with utils.create_read_file(dict_path) as fr:
                for line in fr:
                    # Remove only the line terminator: a bare strip() would
                    # also swallow a whitespace-only key together with the
                    # tab separator, making the entry unparseable.
                    entry = line.rstrip('\r\n')
                    if not entry:
                        continue  # tolerate blank lines
                    # Split on the last tab so the value is the final field.
                    key, value = entry.rsplit('\t', 1)
                    idf_dict[key] = float(value)

        self.unigram_dict = idf_dict
Beispiel #15
0
 def create_global_idf(self, file_list):
     """Return the IDF table computed over the sentences of the given files.

     The file names are printed one per line, the sentences are loaded with
     ``stst.load_sentences`` (its second return value is ignored), and the
     first five sentences are printed before the table is computed with
     ``utils.idf_calculator``.
     """
     listing = '\n'.join(file_list)
     print(listing)

     sentences, _ = stst.load_sentences(file_list)
     preview = sentences[:5]
     print(preview)

     return utils.idf_calculator(sentences)