def __init__(self, df_list=None):
    """Bind the data frames this object works on.

    :param df_list: list of already-loaded data frames. When it is None or
        empty, the cleaned train and test word files under
        ``cfg.DATA_PATH`` are loaded with ``load_csv`` instead.
    """
    if df_list:
        # A caller-supplied list used to be dropped here, leaving
        # self.df_list undefined; keep it.
        self.df_list = df_list
    else:
        # Default paths are only resolved when they are actually needed.
        train_words_clean_file = cfg.DATA_PATH + 'train_words_clean.csv'
        test_words_clean_file = cfg.DATA_PATH + 'test_words_clean.csv'
        self.df_list = [
            load_csv(train_words_clean_file),
            load_csv(test_words_clean_file),
        ]
def __init__(self, df_list=None):
    """Set up the idf storage and bind the data frames to work on.

    :param df_list: list of already-loaded data frames. When it is None or
        empty, 'train_words.csv' and 'test_words.csv' under
        ``cfg.DATA_PATH`` are loaded with ``load_csv`` instead.
    """
    # Path of the idf file.
    self.idf_file = cfg.DATA_PATH + 'idf.txt'
    # idf value recorded per word; starts out empty.
    self.idf = {}

    if df_list:
        self.df_list = df_list
    else:
        default_names = ('train_words.csv', 'test_words.csv')
        self.df_list = [load_csv(cfg.DATA_PATH + name) for name in default_names]
def __init__(self, df_list=None):
    """Bind the data frames this object works on.

    :param df_list: list of already-loaded data frames. When it is None or
        empty, 'train_new.csv' and 'test_new.csv' under ``cfg.DATA_PATH``
        are loaded with ``load_csv`` instead.
    """
    if df_list:
        # A caller-supplied list used to be dropped here, leaving
        # self.df_list undefined; keep it.
        self.df_list = df_list
    else:
        # Default paths are only resolved when they are actually needed.
        train_words_file = cfg.DATA_PATH + 'train_new.csv'
        test_words_file = cfg.DATA_PATH + 'test_new.csv'
        self.df_list = [
            load_csv(train_words_file),
            load_csv(test_words_file),
        ]
def extract_tags_all(self, df_list=None, head_topK=6, content_topK=100, TF=False, withWeight=False):
    """Extract head and content tags for every row and write them to the tag files.

    For the i-th data frame, one line per row is written to
    ``self.tags_file_list[i]`` (UTF-8), with four tab-separated fields:
    the row's ``id``, the space-joined head tags, the space-joined content
    tags and the row's ``label``.

    :param df_list: data frames to process. When None or empty, the frames
        are loaded from ``self.train_words_clean_file`` and
        ``self.test_words_clean_file``.
    :param head_topK: number of tags requested from the ``head`` column;
        shorter results are padded with ``'<PAD_HEAD>'``.
    :param content_topK: number of tags requested from the ``content``
        column; shorter results are padded with ``'<PAD_CONTENT>'``.
    :param TF: when true, every value in ``self.idf`` is overwritten with 1
        before extraction. This permanently mutates ``self.idf``.
    :param withWeight: forwarded unchanged to ``self.extract_tags``.
        NOTE(review): the tags are written with ``' '.join``, so this
        presumably only works when ``extract_tags`` returns plain strings
        -- confirm.
    """
    if not df_list:
        df_list = [
            load_csv(self.train_words_clean_file),
            load_csv(self.test_words_clean_file),
        ]

    def padded_tags(row, n, column, top_k, pad_token):
        # Best-effort extraction for one cell, padded up to top_k entries.
        tags = []
        try:
            tags = self.extract_tags(row[column].split(), topK=top_k, withWeight=withWeight)
        except Exception:
            # Typically a NaN (float) cell, which has no .split(). Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            print('%s %s is nan' % (n, column))
        tags.extend([pad_token] * (top_k - len(tags)))
        return tags

    for i, df in enumerate(df_list):
        if TF:
            for w in self.idf:
                self.idf[w] = 1
        # The context manager closes the file even if a row fails.
        with codecs.open(self.tags_file_list[i], 'w', encoding='utf-8') as fw:
            for n in range(df.shape[0]):
                row = df.iloc[n]  # fetch the row once instead of four times
                tags_head = padded_tags(row, n, 'head', head_topK, '<PAD_HEAD>')
                tags_content = padded_tags(row, n, 'content', content_topK, '<PAD_CONTENT>')
                fw.write('%s\t%s\t%s\t%s\n' % (row['id'],
                                               ' '.join(tags_head),
                                               ' '.join(tags_content),
                                               row['label']))
def __init__(self, train_df=None):
    """Set up file paths, empty lookup tables and the training data.

    :param train_df: training data (presumably a pandas DataFrame -- the
        name suggests so). When None, it is loaded from
        ``self.train_words_file`` with ``load_csv``.
    """
    # NOTE: an earlier revision read 'train_words.csv' here instead.
    self.train_words_file = cfg.DATA_PATH + 'train_tags.csv'
    self.chi_file = cfg.DATA_PATH + 'chi.txt'
    # Lookup tables, initialised empty here.
    self.chi = {}
    self.pos = {}
    self.neg = {}
    # Compare against None explicitly: truth-testing a pandas DataFrame
    # (``not train_df``) raises ValueError, so a supplied frame could never
    # get past the old check.
    if train_df is None:
        train_df = load_csv(self.train_words_file)
    self.train_df = train_df
def extract_train_tags(self, train_df=None, head_topK=6, content_topK=50, TF=False, withWeight=False):
    """Extract keywords from the training set, split by label into two files.

    For every row, the head tags followed by the content tags are written
    as one space-joined line: to ``self.train_tags_pos_file`` when the
    row's ``label`` equals ``'POSITIVE'``, otherwise to
    ``self.train_tags_neg_file`` (both UTF-8).

    :param train_df: training-set data frame. When None, it is loaded from
        ``self.train_words_clean_file``.
    :param head_topK: number of keywords to extract from the ``head``
        column; shorter results are padded with ``'<PAD_HEAD>'``.
    :param content_topK: number of keywords to extract from the
        ``content`` column; shorter results are padded with
        ``'<PAD_CONTENT>'``.
    :param TF: when true, every value in ``self.idf`` is overwritten with 1
        before extraction. This permanently mutates ``self.idf``.
    :param withWeight: forwarded unchanged to ``self.extract_tags``.
        NOTE(review): the tags are written with ``' '.join``, so this
        presumably only works when ``extract_tags`` returns plain strings
        -- confirm.
    """
    if TF:
        for w in self.idf:
            self.idf[w] = 1
    # Compare against None explicitly: ``not train_df`` raises ValueError
    # for a pandas DataFrame, so a supplied frame used to crash here.
    if train_df is None:
        train_df = load_csv(self.train_words_clean_file)

    def padded_tags(row, n, column, top_k, pad_token):
        # Best-effort extraction for one cell, padded up to top_k entries.
        tags = []
        try:
            tags.extend(
                self.extract_tags(row[column].split(), topK=top_k, withWeight=withWeight))
        except Exception:
            # Typically a NaN (float) cell, which has no .split(). Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            print('%s %s is nan' % (n, column))
        tags.extend([pad_token] * (top_k - len(tags)))
        return tags

    # The context managers close both files even if a row fails.
    with codecs.open(self.train_tags_pos_file, 'w', encoding='utf-8') as fw_pos, \
            codecs.open(self.train_tags_neg_file, 'w', encoding='utf-8') as fw_neg:
        for n in range(train_df.shape[0]):
            row = train_df.iloc[n]  # fetch the row once per iteration
            tags = (padded_tags(row, n, 'head', head_topK, '<PAD_HEAD>')
                    + padded_tags(row, n, 'content', content_topK, '<PAD_CONTENT>'))
            target = fw_pos if row['label'] == 'POSITIVE' else fw_neg
            target.write(' '.join(tags) + '\n')
    print('extract train tags done')
tf.flags.DEFINE_integer('num_checkpoints', 5, 'Number of checkpoints to store')

# Misc Parameters
tf.flags.DEFINE_boolean('allow_soft_placement', True, 'Allow device soft device placement')
tf.flags.DEFINE_boolean('log_device_placement', False, 'Log placement of ops on devices')

# Parse the flags and echo every parameter, sorted by name.
# NOTE(review): _parse_flags() and __flags are private tf.flags members;
# presumably tied to the TensorFlow version this script targets -- confirm.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print('\nParameters:')
for attr, value in sorted(FLAGS.__flags.items()):
    print('{}={}'.format(attr.upper(), value))
print('=' * 120)

# Load at most the first 10000 rows, then shuffle them.
words_df = load_csv(FLAGS.train_words_file)[:10000]
words_df = words_df.sample(frac=1)

# Split once: the leading (1 - dev_sample_percentage) share is used for
# training, the remainder is held out for evaluation.
_train_count = int(len(words_df) * (1 - FLAGS.dev_sample_percentage))
TRAIN_WORDS_DF = words_df[:_train_count]
EVL_WORDS_DF = words_df[_train_count:]

print('训练集和验证集总样例数:', len(words_df))
print('训练集样例数:', len(TRAIN_WORDS_DF))
# NOTE(review): this label reads "test set" although the frame printed is
# the held-out evaluation split -- confirm the wording is intended.
print('测试集样例数:', len(EVL_WORDS_DF))
print('=' * 120)

# Load the word2vec model named by the w2v_model flag and print its summary.
w2vm = W2VModelManager()
w2v = w2vm.load_model(FLAGS.w2v_model)
print('word2vec 模型信息:', w2v)