Exemple #1
0
 def __init__(self, df_list=None):
     """Store the given dataframes, or load the cleaned train/test CSVs.

     :param df_list: optional list of dataframes; when None (or empty),
         the cleaned train/test word CSVs under cfg.DATA_PATH are loaded.
     """
     # Fix: the original never assigned self.df_list when a caller
     # supplied df_list, silently discarding the argument.
     self.df_list = df_list
     if not df_list:
         train_words_clean_file = cfg.DATA_PATH + 'train_words_clean.csv'
         test_words_clean_file = cfg.DATA_PATH + 'test_words_clean.csv'
         self.df_list = [
             load_csv(train_words_clean_file),
             load_csv(test_words_clean_file)
         ]
Exemple #2
0
    def __init__(self, df_list=None):
        """Set up IDF bookkeeping and the dataframes to process.

        :param df_list: optional list of dataframes; when None (or empty),
            the default train/test word CSVs under cfg.DATA_PATH are loaded.
        """
        # Path of the IDF document and the per-word IDF statistics.
        self.idf_file = cfg.DATA_PATH + 'idf.txt'
        self.idf = {}

        self.df_list = df_list
        if not df_list:
            default_names = ['train_words.csv', 'test_words.csv']
            self.df_list = [
                load_csv(cfg.DATA_PATH + name) for name in default_names
            ]
Exemple #3
0
 def __init__(self, df_list=None):
     """Store the given dataframes, or load the 'new' train/test CSVs.

     :param df_list: optional list of dataframes; when None (or empty),
         train_new.csv / test_new.csv under cfg.DATA_PATH are loaded.
         (The old docstring documented a nonexistent ``filename_list``.)
     """
     # Fix: the original never assigned self.df_list when a caller
     # supplied df_list, silently discarding the argument.
     self.df_list = df_list
     if not df_list:
         train_words_file = cfg.DATA_PATH + 'train_new.csv'
         test_words_file = cfg.DATA_PATH + 'test_new.csv'
         self.df_list = [
             load_csv(train_words_file),
             load_csv(test_words_file)
         ]
Exemple #4
0
    def extract_tags_all(self,
                         df_list=None,
                         head_topK=6,
                         content_topK=100,
                         TF=False,
                         withWeight=False):
        """Extract keywords for every row of every dataframe and write them out.

        :param df_list: dataframes to process; defaults to the cleaned
            train/test word CSVs.
        :param head_topK: number of keywords to keep from the title ('head').
        :param content_topK: number of keywords to keep from the 'content'.
        :param TF: when True, reset every IDF weight to 1 so scoring reduces
            to plain term frequency.
        :param withWeight: forwarded to self.extract_tags.

        Writes one tab-separated line per row (id, head tags, content tags,
        label) to the matching path in self.tags_file_list. Rows whose
        head/content is missing (NaN) are padded and reported, not fatal.
        """
        if not df_list:
            df_list = [
                load_csv(self.train_words_clean_file),
                load_csv(self.test_words_clean_file)
            ]
        if TF:
            # Hoisted out of the per-dataframe loop: the reset is idempotent,
            # so doing it once is equivalent to the original per-df reset.
            for w in self.idf:
                self.idf[w] = 1
        for i, df in enumerate(df_list):
            # 'with' guarantees the file is closed even if a row raises
            # (the original leaked the handle on any unexpected error).
            with codecs.open(self.tags_file_list[i], 'w',
                             encoding='utf-8') as fw:
                for n in range(df.shape[0]):
                    row = df.iloc[n]
                    tags_head = []
                    tags_content = []
                    try:
                        tags_head = self.extract_tags(row['head'].split(),
                                                      topK=head_topK,
                                                      withWeight=withWeight)
                    except Exception:  # e.g. head is NaN: float has no .split
                        print('%s head is nan' % n)
                    while len(tags_head) < head_topK:
                        tags_head.append('<PAD_HEAD>')
                    try:
                        tags_content = self.extract_tags(
                            row['content'].split(),
                            topK=content_topK,
                            withWeight=withWeight)
                    except Exception:  # e.g. content is NaN
                        print('%s content is nan' % n)
                    while len(tags_content) < content_topK:
                        tags_content.append('<PAD_CONTENT>')

                    fw.write('%s\t%s\t%s\t%s\n' %
                             (row['id'], ' '.join(tags_head),
                              ' '.join(tags_content), row['label']))
Exemple #5
0
    def __init__(self, train_df=None):
        """Keep (or load) the training dataframe and set up CHI bookkeeping.

        :param train_df: optional training dataframe; when None it is loaded
            from train_tags.csv under cfg.DATA_PATH.
        """
        # NOTE: the source file used to be train_words.csv.
        self.train_words_file = cfg.DATA_PATH + 'train_tags.csv'
        self.chi_file = cfg.DATA_PATH + 'chi.txt'
        self.chi = {}  # per-word chi statistics (filled elsewhere)
        self.pos = {}  # presumably positive-class word stats — confirm at use site
        self.neg = {}  # presumably negative-class word stats — confirm at use site

        self.train_df = train_df
        # Fix: 'if not train_df:' raises ValueError when a real pandas
        # DataFrame is passed (a DataFrame has no unambiguous truth value).
        if train_df is None:
            self.train_df = load_csv(self.train_words_file)
Exemple #6
0
    def extract_train_tags(self,
                           train_df=None,
                           head_topK=6,
                           content_topK=50,
                           TF=False,
                           withWeight=False):
        """Extract keywords from the training set, split by label.

        :param train_df: training dataframe; loaded from
            self.train_words_clean_file when None.
        :param head_topK: number of keywords to take from the title ('head').
        :param content_topK: number of keywords to take from the 'content'.
        :param TF: when True, reset every IDF weight to 1 so scoring reduces
            to plain term frequency.
        :param withWeight: forwarded to self.extract_tags.

        Writes POSITIVE-labelled rows to self.train_tags_pos_file and all
        other rows to self.train_tags_neg_file, one space-joined tag line
        per row (-> train_tags_pos.csv / train_tags_neg.csv). Missing (NaN)
        head/content is padded and reported, not fatal.
        """
        if TF:
            for w in self.idf:
                self.idf[w] = 1
        # Fix: 'if not train_df:' raises ValueError when a real pandas
        # DataFrame is passed (a DataFrame has no unambiguous truth value).
        if train_df is None:
            train_df = load_csv(self.train_words_clean_file)
        # 'with' closes both files even if a row raises (the original
        # leaked the handles on any unexpected error).
        with codecs.open(self.train_tags_pos_file, 'w',
                         encoding='utf-8') as fw_pos, \
             codecs.open(self.train_tags_neg_file, 'w',
                         encoding='utf-8') as fw_neg:
            for n in range(train_df.shape[0]):
                row = train_df.iloc[n]
                tags_head = []
                tags_content = []
                try:
                    tags_head.extend(
                        self.extract_tags(row['head'].split(),
                                          topK=head_topK,
                                          withWeight=withWeight))
                except Exception:  # e.g. head is NaN: float has no .split
                    print('%s head is nan' % n)
                while len(tags_head) < head_topK:
                    tags_head.append('<PAD_HEAD>')
                try:
                    tags_content.extend(
                        self.extract_tags(row['content'].split(),
                                          topK=content_topK,
                                          withWeight=withWeight))
                except Exception:  # e.g. content is NaN
                    print('%s content is nan' % n)
                while len(tags_content) < content_topK:
                    tags_content.append('<PAD_CONTENT>')
                tags = tags_head + tags_content

                if row['label'] == 'POSITIVE':
                    fw_pos.write(' '.join(tags) + '\n')
                else:
                    fw_neg.write(' '.join(tags) + '\n')
        print('extract train tags done')
Exemple #7
0
# Checkpoint / device-placement flags (TensorFlow 1.x tf.flags API).
tf.flags.DEFINE_integer('num_checkpoints', 5, 'Number of checkpoints to store')

# Misc Parameters
tf.flags.DEFINE_boolean('allow_soft_placement', True,
                        'Allow device soft device placement')
tf.flags.DEFINE_boolean('log_device_placement', False,
                        'Log placement of ops on devices')

FLAGS = tf.flags.FLAGS
# NOTE(review): FLAGS._parse_flags() and FLAGS.__flags are private TF 1.x
# APIs removed in later TensorFlow releases — confirm the pinned version.
FLAGS._parse_flags()
print('\nParameters:')
for attr, value in sorted(FLAGS.__flags.items()):
    print('{}={}'.format(attr.upper(), value))
print('=' * 120)

# Load (at most 10k rows of) the training words CSV and split it into a
# train partition and a dev/eval partition by dev_sample_percentage.
words_df = load_csv(FLAGS.train_words_file)[:10000]
words_df = words_df.sample(frac=1)  # shuffle rows
TRAIN_WORDS_DF = words_df[0:int(words_df.shape[0] *
                                (1 - FLAGS.dev_sample_percentage))]
EVL_WORDS_DF = words_df[int(words_df.shape[0] *
                            (1 - FLAGS.dev_sample_percentage)):]
print('训练集和验证集总样例数:', words_df.shape[0])
print('训练集样例数:', TRAIN_WORDS_DF.shape[0])
print('测试集样例数:', EVL_WORDS_DF.shape[0])
print('=' * 120)

# Load the pre-trained word2vec model used downstream.
w2vm = W2VModelManager()
w2v = w2vm.load_model(FLAGS.w2v_model)
print('word2vec 模型信息:', w2v)