def extract_information(self, train_instances):
    """Compute IDF weights over the lowercased word sequences of every sentence pair."""
    sequences = []
    for instance in train_instances:
        first, second = instance.get_word(type=self.type, lower=True)
        sequences.extend((first, second))
    self.idf_weight = utils.idf_calculator(sequences)
def extract_information(self, train_instances):
    """Build (training) or reload (otherwise) the unigram IDF dictionary.

    Training mode computes IDF over warrant0/warrant1/reason/claim word
    sequences and persists it to RESOURCE_DIR/idf_dict.txt; non-training
    mode parses that file back. Result is stored on self.unigram_dict.
    """
    if self.is_training:
        corpus = []
        for instance in train_instances:
            # Only the first four fields feed the IDF corpus; title/info are unused.
            warrant0, warrant1, reason, claim, _title, _info = instance.get_six(type='word')
            corpus.extend((warrant0, warrant1, reason, claim))
        idf_dict = utils.idf_calculator(corpus)
        with utils.create_write_file(config.RESOURCE_DIR + '/idf_dict.txt') as fw:
            for key in idf_dict:
                print('{}\t{}'.format(key, idf_dict[key]), file=fw)
        print(len(idf_dict))
    else:
        idf_dict = {}
        with utils.create_read_file(config.RESOURCE_DIR + '/idf_dict.txt') as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
def extract_information(self, train_instances):
    """Compute IDF weights over the dependency representations of every pair."""
    collected = []
    for instance in train_instances:
        deps_a, deps_b = instance.get_dependency()
        collected.extend((deps_a, deps_b))
    self.idf_weight = utils.idf_calculator(collected)
def extract_information(self, train_instances):
    """Compute IDF weights over POS-tag sequences (stopwords retained).

    NOTE: these IDF weights differ from the word-level variants elsewhere
    in this file, since the corpus here is tags rather than tokens.
    """
    tag_sequences = []
    for instance in train_instances:
        tags_a, tags_b = instance.get_word(type='pos', stopwords=True)
        tag_sequences.extend((tags_a, tags_b))
    self.idf_weight = utils.idf_calculator(tag_sequences)
def extract_information(self, train_instances):
    """Compute IDF weights over lowercased lemma sequences (stopword policy from self)."""
    lemma_sequences = []
    for instance in train_instances:
        lemmas_a, lemmas_b = instance.get_word(
            type='lemma', stopwords=self.stopwords, lower=True)
        lemma_sequences.extend((lemmas_a, lemmas_b))
    self.idf_weight = utils.idf_calculator(lemma_sequences)
def extract_information(self, train_instances):
    """Compute sentence-level IDF weights, then derive a word index and embeddings."""
    sentences = [instance.get_sent(self.type) for instance in train_instances]
    self.idf_weight = utils.idf_calculator(sentences)
    # Enumerate the IDF vocabulary to assign each word an integer index.
    self.word2index = {}
    for position, token in enumerate(self.idf_weight.keys()):
        self.word2index[token] = position
    self.embeddings = utils.load_word_embedding(
        self.word2index, self.emb_file, self.dim, self.binary)
def extract_information(self, train_instances):
    """Build IDF weights and a vocabulary over dependency pairs.

    Only fields 1 and 2 of each dependency triple are kept (presumably the
    head/modifier pair — confirm against get_dependency()'s triple layout).
    """
    pair_sequences = []
    for instance in train_instances:
        deps_a, deps_b = instance.get_dependency()
        pair_sequences.append([(triple[1], triple[2]) for triple in deps_a])
        pair_sequences.append([(triple[1], triple[2]) for triple in deps_b])
    self.idf_weight = utils.idf_calculator(pair_sequences)
    self.vocab = utils.word2index(self.idf_weight)
def extract_information(self, train_instances):
    """Compute word-level IDF weights, then build the vocabulary and load embeddings."""
    token_sequences = []
    for instance in train_instances:
        tokens_a, tokens_b = instance.get_word(
            type=self.word_type, stopwords=self.stopwords, lower=self.lower)
        token_sequences.extend((tokens_a, tokens_b))
    self.idf_weight = utils.idf_calculator(token_sequences)
    self.vocab = utils.word2index(self.idf_weight)
    # load_word_embedding may prune the vocab to words present in the embedding file.
    self.vocab, self.embeddings = utils.load_word_embedding(self.vocab, self.emb_file)
def extract_information(self, train_instances):
    """Compute IDF weights restricted to noun tokens (POS tag 'n'), then load embeddings."""
    noun_sequences = []
    for instance in train_instances:
        tagged_a, tagged_b = instance.get_pos_tag(stopwords=False)
        noun_sequences.append([token for token, tag in tagged_a if tag == 'n'])
        noun_sequences.append([token for token, tag in tagged_b if tag == 'n'])
    weights = utils.idf_calculator(noun_sequences)
    index = utils.word2index(weights)
    self.idf_weight = weights
    self.vocab, self.embeddings = utils.load_word_embedding(index, self.emb_file)
def extract_information(self, train_instances):
    """Compute noun-only IDF weights ('n' POS tag), a word index, and embeddings."""
    noun_sequences = []
    for instance in train_instances:
        tagged_a, tagged_b = instance.get_pos_tag(stopwords=False)
        noun_sequences.append([token for token, tag in tagged_a if tag == 'n'])
        noun_sequences.append([token for token, tag in tagged_b if tag == 'n'])
    self.idf_weight = utils.idf_calculator(noun_sequences)
    # Assign indices by enumerating the IDF vocabulary.
    self.word2index = {}
    for position, token in enumerate(self.idf_weight.keys()):
        self.word2index[token] = position
    self.embeddings = utils.load_word_embedding(
        self.word2index, self.emb_file, self.dim, self.binary)
def extract_information(self, train_instances):
    """Compute word-level IDF weights, a word index, and embeddings (dim/binary aware)."""
    token_sequences = []
    for instance in train_instances:
        tokens_a, tokens_b = instance.get_word(
            type=self.word_type, stopwords=self.stopwords, lower=self.lower)
        token_sequences.extend((tokens_a, tokens_b))
    self.idf_weight = utils.idf_calculator(token_sequences)
    self.word2index = {}
    for position, token in enumerate(self.idf_weight.keys()):
        self.word2index[token] = position
    self.embeddings = utils.load_word_embedding(
        self.word2index, self.emb_file, self.dim, self.binary)
def extract_information(self, train_instances):
    """Compute or reload the type-specific IDF dictionary and build a word index.

    Training mode computes sentence-level IDF and writes it (sorted by IDF
    value, descending) to DICTIONARY_DIR/<type>_idf_dict.txt; otherwise the
    file is parsed back. The word index enumerates the vocabulary sorted
    reverse-alphabetically, so both branches yield the same mapping.
    """
    dict_path = config.DICTIONARY_DIR + '/{}_idf_dict.txt'.format(self.type)
    if self.is_training:
        sentences = [instance.get_sent(self.type) for instance in train_instances]
        idf_dict = utils.idf_calculator(sentences)
        with utils.create_write_file(dict_path) as fw:
            ranked = sorted(idf_dict.items(), key=lambda kv: kv[1], reverse=True)
            for token, weight in ranked:
                print('{}\t{}'.format(token, weight), file=fw)
    else:
        idf_dict = {}
        with utils.create_read_file(dict_path) as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
    self.word2index = {
        token: position
        for position, token in enumerate(sorted(idf_dict.keys(), reverse=True))
    }
def extract(self, train_instance):
    """Extract sequence, match, and count-vectorized features for one sentence pair.

    Args:
        train_instance: instance exposing get_preprocess() -> (sa, sb) token sequences.

    Returns:
        (features, infos): the concatenated feature values and the raw [sa, sb] pair.
    """
    sa, sb = train_instance.get_preprocess()
    features = []
    feature, info = utils.sentence_sequence_features(sa, sb)
    features += feature
    feature, info = utils.sentence_match_features(sa, sb)
    features += feature
    # Bag-of-words IDF computed locally from just this pair, used as weights
    # for the count-based vectorized features.
    bow = utils.idf_calculator([sa, sb])
    feature, info = utils.sentence_vectorize_features(sa, sb, bow, convey='count')
    features += feature
    infos = [sa, sb]
    return features, infos
def extract_information(self, train_instances):
    """Compute (training) or reload the word-level unigram IDF dictionary.

    Training mode writes DICTIONARY_DIR/idf_dict.txt; otherwise that file
    is parsed back. Result is stored on self.unigram_dict.
    """
    if self.is_training:
        corpus = [instance.get_word() for instance in train_instances]
        idf_dict = utils.idf_calculator(corpus)
        with utils.create_write_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fw:
            for token in idf_dict:
                print('{}\t{}'.format(token, idf_dict[token]), file=fw)
        print(len(idf_dict))
    else:
        idf_dict = {}
        with utils.create_read_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fr:
            for raw in fr:
                parts = raw.strip().split('\t')
                idf_dict[parts[0]] = float(parts[1])
    self.unigram_dict = idf_dict
def create_global_idf(self, file_list):
    """Return a corpus-wide IDF dictionary over sentences loaded from file_list."""
    print('\n'.join(file_list))
    sentences, _ = stst.load_sentences(file_list)
    print(sentences[:5])  # debug peek at the first few loaded sentences
    return utils.idf_calculator(sentences)