Beispiel #1
0
 def all_CR(self, infopath, extpath):
     """Compute the CR value for every file pair under the two directories.

     Each file found in ``infopath`` is paired with the file of the same
     name in ``extpath`` and ``self.CR`` is evaluated on the pair.

     :param infopath: directory holding the original info files
     :param extpath: directory holding the extracted info files
     :return: list of CR values, one per file in ``infopath``
     """
     ratios = []
     for entry in os.listdir(infopath):
         original = FileUtil.readfile(filename=os.path.join(infopath, entry))
         extracted = FileUtil.readfile(filename=os.path.join(extpath, entry))
         ratios.append(self.CR(original, extracted))
     return ratios
Beispiel #2
0
 def info(self, fi='', pagenum=100):
     """Run the full retrieval pipeline for the document *fi*.

     Pipeline: keyword extraction -> web crawling -> text pre-processing
     -> index construction -> retrieval.

     :param fi: path of the input document
     :param pagenum: number of result pages to crawl
     :return: tuple ``(keywords, num)`` — the segmented words of the
         document and the number of documents retrieved for them
     """
     raw_text = FileUtil.readfile(fi)
     keywords = PreDeal.seg(raw_text)
     # 1. keyword extraction (TextRank over the raw document text)
     keys = jieba.analyse.textrank(raw_text,
                                   topK=10,
                                   withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
     key_tag = '_'.join(keys)
     # 2. crawl related pages via the search engine
     # 2.1 collect result links
     spider_link = SpiderLink(keys, self.root)
     spider_link.crawl(pagenum)
     # 2.2 fetch the page contents
     spider_to = SpiderTo(key_tag + '.html')
     spider_to.crawl()
     # 3. pre-process: dedupe, drop stop words, segment,
     #    keep the url and the keyword set of each page
     prepath = os.path.join(config.prepapath, key_tag)
     PreDeal().savetexts(filepath=os.path.join(config.spidertext, key_tag),
                         prepath=prepath)
     # 4. build the index and retrieve the pages containing the keywords
     # 4.1 index construction
     indexpath = os.path.join(config.indexpath, key_tag)
     Index().build(datapath=prepath, indexpath=indexpath)
     searcher = Search1(filename=fi, pindexp=indexpath)
     # 4.2 search and persist the hits
     num = searcher.retrieve(keywords=keywords[:])
     return keywords, num
Beispiel #3
0
 def get_url_titles(self):
     """Parse ``self.filename`` (an HTML link table) into url/title pairs.

     Each ``<tr>`` row is expected to carry the url in its first ``<td>``
     and the title in its second ``<td>``.

     :return: list of dicts, each with the keys ``'url'`` and ``'title'``
     """
     html_str = FileUtil.readfile(self.filename)
     rows = etree.HTML(text=html_str).xpath('//tr')
     # dict literal instead of dict([(...)]) and a comprehension instead
     # of a manual append loop — same result, idiomatic and faster
     return [{'url': row.xpath('string(./td[1])'),
              'title': row.xpath('string(./td[2])')}
             for row in rows]
Beispiel #4
0
 def build(self, datapath, indexpath):
     """Build a whoosh full-text index over every file in *datapath*.

     :param datapath: directory whose files become the indexed documents
     :param indexpath: directory where the index is stored (created if
         missing; an existing index there is overwritten)
     :raises Exception: re-raises any failure during indexing after
         cancelling the writer
     """
     self.logger.info('the process of create full-text index!')
     schema = Schema(title=TEXT(stored=True),
                     path=TEXT(stored=True),
                     content=TEXT(analyzer=SpaceSeparatedTokenizer()))
     if not os.path.exists(indexpath):  # index storage path
         os.makedirs(indexpath)
     ix = create_in(indexpath, schema)  # create (or overwrite) the index
     writer = ix.writer()
     try:
         for filename in os.listdir(datapath):
             filepath = os.path.join(datapath, filename)
             content = FileUtil.readfile(filepath)
             writer.add_document(path=filepath, title=filename, content=content)
     except Exception:
         # a whoosh writer holds the index lock; cancel it instead of
         # leaking the lock when a file cannot be read or indexed
         writer.cancel()
         raise
     writer.commit()
Beispiel #5
0
class PreDeal(object):
    """Extract Chinese characters, remove stop words, segment, save."""

    # stop-word list, loaded once at class-definition time
    stopwords = FileUtil.readfile(config.stopwordpath).splitlines()

    def __init__(self):
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(message)s'
        )
        self.logger = logging.getLogger("Text manipulation")

    @classmethod
    def seg(cls, sentences):
        """Segment *sentences* into keywords.

        Non-Chinese characters are stripped first, then the text is cut
        with jieba (accurate mode) and stop words are dropped.

        :param sentences: raw text
        :return: list of segmented words with stop words removed
        """
        sentences = cls._renostr(sentences)
        # membership tests against a set are O(1) vs O(n) for the list,
        # and the loop tests every produced token
        stopwords = set(cls.stopwords)
        departs = jieba.cut(sentences, HMM=True)
        return [word for word in departs if word not in stopwords]

    @staticmethod
    def _renostr(strings):
        """Keep only the Chinese (CJK unified ideograph) characters.

        :param strings: arbitrary text
        :return: the Chinese characters concatenated into one string
        """
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        return ''.join(pattern.findall(strings))

    def savetexts(self, filepath, prepath):
        """Pre-process every html file under *filepath* and store the
        results under *prepath*.

        Each stored file contains the page url followed by its
        tab-joined keywords.  Failures are logged (with traceback) and
        swallowed — the method is deliberately best-effort.

        :param filepath: directory holding the crawled html files
        :param prepath: directory the pre-processed texts are written to
        """
        self.logger.info('init pretreatment directory:"{0}"'.format(prepath))
        FileUtil.init_path(prepath)
        try:
            for filename in os.listdir(filepath):
                file = os.path.join(filepath, filename)
                if os.path.isfile(file):
                    # 1. url and raw text of the crawled page
                    url, text = FileUtil.get_url_text(file)
                    # 2. keyword extraction
                    kws = PreDeal.seg(text)
                    self.logger.info(
                        "Store pretreatment texts content:{0}".format(
                            filename))
                    FileUtil.writefile(url + '\t'.join(kws),
                                       os.path.join(prepath, filename))
            self.logger.info('Text pretreatment End!')
        except Exception:
            # log with traceback instead of a bare print(e), keeping the
            # original best-effort (non-raising) behaviour
            self.logger.exception('Text pretreatment failed')