Example 1
    def save_url_articl(self):
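        # Persist url_dict/article_dict to url_article.pkl and also write
        # human-readable url.txt / article.txt summaries into work_dir.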
        log_print('Saving data file in: {}'.format(self.work_dir))
        assert self.work_dir != '', 'Data file dir cannot be empty!'
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)

        url_article_file = os.path.join(self.work_dir, 'url_article.pkl')
        if os.path.exists(url_article_file):
            logging.warning(
                'Data file: \'{}\' already exists and will be replaced!'.format(
                    url_article_file))

        with open(url_article_file, 'wb') as fout:
            result = {
                'url_dict': self.url_dict,
                'article_dict': self.article_dict,
                # 'word_dict': self.word_dict, 'simultaneous_dict': self.simultaneous_dict
            }
            pickle.dump(result, fout)

        with open(os.path.join(self.work_dir, 'url.txt'),
                  'wt',
                  encoding='utf-8') as fout:
            fout.write('# article number ' + '#' * 30 + '\n')
            _urls = set()
            for k, v in self.url_dict.items():
                fout.write('{}: {}\n'.format(k, len(v)))
                _urls = _urls.union(v)
            fout.write('unique article number: {}\n'.format(len(_urls)))
            fout.write('\n')
            for k, v in self.url_dict.items():
                fout.write('# {} '.format(k) + '#' * 30 + '\n')
                for i, _url in enumerate(v):
                    fout.write('{}. {}\n'.format(i + 1, _url))
            fout.write('\n')

        with open(os.path.join(self.work_dir, 'article.txt'),
                  'wt',
                  encoding='utf-8') as fout:
            # title read like time keywords abstract text segmentation
            for i, (k, v) in enumerate(self.article_dict.items()):
                fout.write('# {} '.format(i + 1) + '#' * 30 + '\n')
                fout.write('url: {}\n'.format(k))
                fout.write('title: {}\n'.format(v[0]))
                fout.write('read: {}\n'.format(v[1]))
                fout.write('like: {}\n'.format(v[2]))
                fout.write('time: {}\n'.format(v[3]))
                fout.write('keywords: {}\n'.format(v[4]))
                fout.write('abstract: {}\n'.format(v[5]))
                fout.write('text: {}\n'.format(v[6]))
                fout.write('segmentation: {}\n\n'.format(v[7]))
Example 2
    def load_url_article(self, keywords):
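        # Returns (True, None) when the cached data covers every keyword;
        # otherwise (False, keywords) if there is no cached file, or
        # (False, missing_keys) if some keywords are absent from url_dict.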
        data_file = os.path.join(self.work_dir, 'url_article.pkl')
        if self.work_dir != '' and os.path.exists(data_file):
            log_print('Loading data file:{}'.format(data_file))

            with open(data_file, 'rb') as fin:
                result = pickle.load(fin)
                self.url_dict = result['url_dict']
                self.article_dict = result['article_dict']
            missing_keys = [
                keyword for keyword in keywords
                if keyword not in self.url_dict.keys()
            ]
            if len(missing_keys) > 0:
                return False, missing_keys
            else:
                return True, None
        else:
            return False, keywords
Example 3
def wait_for_elem(xpath, time_out_internals=[60, 120, 180]):
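    # Wait until the element located by xpath appears. On each TimeoutException,
    # time_out_tips() is called with the next value from time_out_internals
    # before retrying. Returns a (found, code) tuple: (True, 1) on success,
    # (False, -1) after repeated timeouts, (False, -2) if the element is
    # missing, (False, -3) on any other exception. Callers should test the
    # boolean, e.g. wait_for_elem(xpath)[0].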
    time_out_nums = 0
    while True:
        try:
            data_loader.DataLoader.WAIT.until(
                lambda brows: brows.find_element_by_xpath(xpath))
            time.sleep(random.randint(1, 5))
            return True, 1
        except TimeoutException:
            if time_out_nums >= 3:
                return False, -1
            else:
                time_out_tips(time_out_internals[time_out_nums])
                time_out_nums += 1
                continue
        except NoSuchElementException:
            return False, -2
        except Exception as msg:
            log_print(msg)
            return False, -3
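
The snippets above use DataLoader.BROWSER, DataLoader.WAIT and DataLoader.FILTER without showing how they are created. Below is a minimal sketch of those class attributes, assuming a Chrome driver, a 30-second explicit wait, the Selenium 3 find_element_by_* API the examples rely on, and a placeholder filter object:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


class _KeepAllFilter:
    # Hypothetical stand-in for the project's FILTER object; the real
    # filter_when_getting() decides whether a search hit is kept.
    def filter_when_getting(self, keyword, title, abstract):
        return True


class DataLoader:
    BROWSER = webdriver.Chrome()       # shared browser instance (driver type assumed)
    WAIT = WebDriverWait(BROWSER, 30)  # explicit wait used by wait_for_elem (timeout assumed)
    FILTER = _KeepAllFilter()          # placeholder result filter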
Example 4
 def get_absts_page():
     # Get URLs and titles
     urls, titles, absts = [], [], []
     if wait_for_elem(
             '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
     )[0]:
         try:
             # title read like time keywords abstract text segmentation
             node_list = DataLoader.BROWSER.find_elements_by_xpath(
                 '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
             )
             for node in node_list:
                 try:
                     tmp1 = node.find_element_by_css_selector(
                         'dt > div.limit_width > a')
                     tmp2 = node.find_element_by_css_selector(
                         'dd.search-detail')
                     if DataLoader.FILTER.filter_when_getting(
                             keyword, tmp1.text, tmp2.text):
                         urls.append(tmp1.get_attribute('href'))
                         titles.append(tmp1.text)
                         absts.append(tmp2.text)
                 except selenium.common.exceptions.NoSuchElementException:
                     continue
         except selenium.common.exceptions.NoSuchElementException:
             log_print('NoSuchElementException!')
             return [], [], []
         except selenium.common.exceptions.TimeoutException:
             log_print('TimeoutException!')
             return [], [], []
         except Exception as msg:
             log_print(msg)
             return [], [], []
     return urls, titles, absts
Example 5
 def get_absts_page():
     # Get URLs and titles
     urls, titles, absts = [], [], []
     if wait_for_elem(
             '//ul[@class="note-list"]/li/div[@class="content"]')[0]:
         try:
             # title read like time keywords abstract text segmentation
             node_list = DataLoader.BROWSER.find_elements_by_xpath(
                 '//ul[@class="note-list"]/li/div[@class="content"]')
             for node in node_list:
                 tmp1 = node.find_element_by_css_selector('a.title')
                 tmp2 = node.find_element_by_css_selector('p.abstract')
                 if DataLoader.FILTER.filter_when_getting(
                         keyword, tmp1.text, tmp2.text):
                     urls.append(tmp1.get_attribute('href'))
                     titles.append(tmp1.text)
                     absts.append(tmp2.text)
         except selenium.common.exceptions.NoSuchElementException:
             log_print('NoSuchElementException!')
             return [], [], []
         except selenium.common.exceptions.TimeoutException:
             log_print('TimeoutException!')
             return [], [], []
         except Exception as msg:
             log_print(msg)
             return [], [], []
     return urls, titles, absts
Example 6
 def get_article(url):
     # Get the article body
     # article_dict: url-(title read like time keywords abstract text segmentation)
     DataLoader.BROWSER.get(url)
     log_print('Browsing url:{}...'.format(url))
     result = []
     if wait_for_elem('//article[@class="_2rhmJa"]')[0]:
         try:
             title = DataLoader.BROWSER.find_element_by_xpath(
                 '//h1[@class="_1RuRku"]')
             article = DataLoader.BROWSER.find_element_by_xpath(
                 '//article[@class="_2rhmJa"]')
             dsoj = DataLoader.BROWSER.find_element_by_xpath(
                 '//div[@class="s-dsoj"]')
             # use relative XPaths ('.//') so the lookup stays inside the dsoj node
             art_time = dsoj.find_element_by_xpath('.//time')
             read_nums = dsoj.find_element_by_xpath(
                 './/span[contains(text(), "阅读")]')
             like = DataLoader.BROWSER.find_element_by_xpath(
                 '//span[@class="_1LOh_5"]')
             # log_print('title:{}'.format(title.text))
             # log_print(art_time.text)
             # log_print(read_nums.text)
             # log_print(like.text)
         except selenium.common.exceptions.NoSuchElementException:
             log_print('NoSuchElementException!')
             return []
         except selenium.common.exceptions.TimeoutException:
             log_print('TimeoutException!')
             return []
         except Exception as msg:
             log_print(msg)
             return []
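         # Parse the visible counters; the raw strings are assumed to look like
         # '阅读 1,234' for reads and '12人点赞' for likes (the trailing three
         # characters are stripped below).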
         result = [
             title.text,
             int(read_nums.text.replace(',', '').split(' ')[-1]),
             int(like.text.replace(',', '')[:-3]), art_time.text, [],
             '', article.text, None
         ]
     return result
Example 7
        def get_article(url):
            # Get the article body
            # article_dict: url-(title read like time keywords abstract text segmentation)
            DataLoader.BROWSER.get(url)
            log_print('Browsing url:{}...'.format(url))
            result = []
            read_nums_, like_ = 0, 0
            if wait_for_elem('//div[@id="content_views"]')[0]:
                try:
                    article = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@id="content_views"]')
                    art_time = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/span[@class="time"]')
                    read_nums = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/span[@class="read-count"]'
                    )
                    like = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/a/span[@class="get-collection"]'
                    )
                    log_print('article:{}'.format(article.text))
                    log_print(art_time.text)
                    log_print(read_nums.text)
                    log_print(like.text)
                except selenium.common.exceptions.NoSuchElementException:
                    log_print('NoSuchElementException!')
                    return []
                except selenium.common.exceptions.TimeoutException:
                    log_print('TimeoutException!')
                    return []
                except Exception as msg:
                    log_print(msg)
                    return []

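                # The counters can be blank, so the 0 defaults above are kept
                # unless the text is non-empty.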
                if read_nums.text.replace(' ', '') != '':
                    read_nums_ = int(read_nums.text.replace(' ', ''))
                if like.text.replace(' ', '') != '':
                    like_ = int(like.text.replace(' ', ''))
                result = [
                    '', read_nums_, like_, art_time.text, [], '', article.text,
                    None
                ]
            return result
Example 8
    def get_url_articl(self, keywords, pages=15):
        log_print('Getting urls from csdn...')
        # self.url_dict: keyword-[urls]
        base_url = 'https://so.csdn.net/so/search/s.do?p={}&q={}&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0'

        def get_absts_page():
            # Get URLs and titles
            urls, titles, absts = [], [], []
            if wait_for_elem(
                    '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
            )[0]:
                try:
                    # title read like time keywords abstract text segmentation
                    node_list = DataLoader.BROWSER.find_elements_by_xpath(
                        '//div[@class="search-list-con"]/dl[contains(@class, "search-list")]'
                    )
                    for node in node_list:
                        try:
                            tmp1 = node.find_element_by_css_selector(
                                'dt > div.limit_width > a')
                            tmp2 = node.find_element_by_css_selector(
                                'dd.search-detail')
                            if DataLoader.FILTER.filter_when_getting(
                                    keyword, tmp1.text, tmp2.text):
                                urls.append(tmp1.get_attribute('href'))
                                titles.append(tmp1.text)
                                absts.append(tmp2.text)
                        except selenium.common.exceptions.NoSuchElementException:
                            continue
                except selenium.common.exceptions.NoSuchElementException:
                    log_print('NoSuchElementException!')
                    return [], [], []
                except selenium.common.exceptions.TimeoutException:
                    log_print('TimeoutException!')
                    return [], [], []
                except Exception as msg:
                    log_print(msg)
                    return [], [], []
            return urls, titles, absts

        def get_article(url):
            # Get the article body
            # article_dict: url-(title read like time keywords abstract text segmentation)
            DataLoader.BROWSER.get(url)
            log_print('Browsing url:{}...'.format(url))
            result = []
            read_nums_, like_ = 0, 0
            if wait_for_elem('//div[@id="content_views"]')[0]:
                try:
                    article = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@id="content_views"]')
                    art_time = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/span[@class="time"]')
                    read_nums = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/span[@class="read-count"]'
                    )
                    like = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="bar-content"]/a/span[@class="get-collection"]'
                    )
                    log_print('article:{}'.format(article.text))
                    log_print(art_time.text)
                    log_print(read_nums.text)
                    log_print(like.text)
                except selenium.common.exceptions.NoSuchElementException:
                    log_print('NoSuchElementException!')
                    return []
                except selenium.common.exceptions.TimeoutException:
                    log_print('TimeoutException!')
                    return []
                except Exception as msg:
                    log_print(msg)
                    return []

                if read_nums.text.replace(' ', '') != '':
                    read_nums_ = int(read_nums.text.replace(' ', ''))
                if like.text.replace(' ', '') != '':
                    like_ = int(like.text.replace(' ', ''))
                result = [
                    '', read_nums_, like_, art_time.text, [], '', article.text,
                    None
                ]
            return result

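        # For each keyword, page through the search results, fetch every article
        # not yet in article_dict, and record its URL under the keyword; a page
        # with fewer than 5 hits ends the keyword. Results are saved in the
        # finally clause even if the crawl is interrupted.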
        try:
            for i, keyword in enumerate(keywords):
                self.url_dict[keyword] = set()
                for page in range(1, pages + 1):
                    log_print('keyword:{}, Searching page:{}...'.format(
                        keyword, page))
                    url = base_url.format(page, urllib.parse.quote(keyword))
                    # Send the request
                    DataLoader.BROWSER.get(url)
                    # Scroll the page until the 'next page' link appears, the wait times out, or the link never exists
                    # driver.execute_script('document.documentElement.scrollTop=5000')
                    # Get the articles on the current page
                    urls, titles, absts = get_absts_page()
                    for url, title, abst in zip(urls, titles, absts):
                        if url not in self.article_dict.keys():
                            log_print(title)
                            result = get_article(url)
                            if len(result) > 0:
                                result[0] = title
                                self.article_dict[url] = result
                        if (url in self.article_dict) and (
                                url not in self.url_dict[keyword]):
                            self.url_dict[keyword].add(url)
                    if len(urls) < 5:
                        break
        except InterruptedError:
            log_print('Stop manually!')
        finally:
            self.save_url_articl()
Example 9
import os

from knowledge_graph.concept_management.concept_manager import ConceptManager
from knowledge_graph.relation_management.relation_manager import RelationManager
# NOTE: the import path for DataManager is not shown in these snippets; the line
# below mirrors the other managers and is an assumption.
from knowledge_graph.data_management.data_manager import DataManager

from knowledge_graph.utils.log_utils import log_print, log_close
from knowledge_graph.utils.path_utils import create_work_dir

os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), '../'))

if __name__ == '__main__':
    # Create the working directory
    work_dir = create_work_dir('data/knowledge_graph/ex1/')

    # 1 Get the corpus
    data_manager = DataManager(work_dir)
    # 1.1 Get URLs and articles
    url_dict, article_dict = data_manager.get_url_article(['机器人故障'])
    # 1.2 Count word frequencies; build the vocabulary and word co-occurrence counts
    word_dict, simultaneous_dict = data_manager.get_word_frequency()

    # # 2 Get concepts
    # concept_manager = ConceptManager(work_dir)
    # concept_dict = concept_manager.get_concept(article_dict, word_dict, simultaneous_dict)
    #
    # # 3 Relation extraction
    # relation_manager = RelationManager(work_dir)
    # relation_result = relation_manager.get_relation(concept_dict, article_dict)
    #
    log_print('Finish!')
    log_close()
    pass
Example 10
    def get_url_articl(self, keywords, pages=100):
        log_print('Getting urls from JianShu...')
        # self.url_dict: keyword-[urls]
        base_url = 'https://www.jianshu.com/search?q={}&page={}&type=note'

        def get_absts_page():
            # Get URLs and titles
            urls, titles, absts = [], [], []
            if wait_for_elem(
                    '//ul[@class="note-list"]/li/div[@class="content"]')[0]:
                try:
                    # title read like time keywords abstract text segmentation
                    node_list = DataLoader.BROWSER.find_elements_by_xpath(
                        '//ul[@class="note-list"]/li/div[@class="content"]')
                    for node in node_list:
                        tmp1 = node.find_element_by_css_selector('a.title')
                        tmp2 = node.find_element_by_css_selector('p.abstract')
                        if DataLoader.FILTER.filter_when_getting(
                                keyword, tmp1.text, tmp2.text):
                            urls.append(tmp1.get_attribute('href'))
                            titles.append(tmp1.text)
                            absts.append(tmp2.text)
                except selenium.common.exceptions.NoSuchElementException:
                    log_print('NoSuchElementException!')
                    return [], [], []
                except selenium.common.exceptions.TimeoutException:
                    log_print('TimeoutException!')
                    return [], [], []
                except Exception as msg:
                    log_print(msg)
                    return [], [], []
            return urls, titles, absts

        def get_article(url):
            # Get the article body
            # article_dict: url-(title read like time keywords abstract text segmentation)
            DataLoader.BROWSER.get(url)
            log_print('Browsing url:{}...'.format(url))
            result = []
            if wait_for_elem('//article[@class="_2rhmJa"]')[0]:
                try:
                    title = DataLoader.BROWSER.find_element_by_xpath(
                        '//h1[@class="_1RuRku"]')
                    article = DataLoader.BROWSER.find_element_by_xpath(
                        '//article[@class="_2rhmJa"]')
                    dsoj = DataLoader.BROWSER.find_element_by_xpath(
                        '//div[@class="s-dsoj"]')
                    # use relative XPaths ('.//') so the lookup stays inside the dsoj node
                    art_time = dsoj.find_element_by_xpath('.//time')
                    read_nums = dsoj.find_element_by_xpath(
                        './/span[contains(text(), "阅读")]')
                    like = DataLoader.BROWSER.find_element_by_xpath(
                        '//span[@class="_1LOh_5"]')
                    # log_print('title:{}'.format(title.text))
                    # log_print(art_time.text)
                    # log_print(read_nums.text)
                    # log_print(like.text)
                except selenium.common.exceptions.NoSuchElementException:
                    log_print('NoSuchElementException!')
                    return []
                except selenium.common.exceptions.TimeoutException:
                    log_print('TimeoutException!')
                    return []
                except Exception as msg:
                    log_print(msg)
                    return []
                result = [
                    title.text,
                    int(read_nums.text.replace(',', '').split(' ')[-1]),
                    int(like.text.replace(',', '')[:-3]), art_time.text, [],
                    '', article.text, None
                ]
            return result

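        # Same crawl loop as the CSDN version: page through the JianShu search
        # results, fetch new articles, and map each keyword to its URLs; a page
        # with fewer than 10 hits ends the keyword.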
        for i, keyword in enumerate(keywords):
            self.url_dict[keyword] = set()
            for page in range(1, pages):
                log_print('keyword:{}, Searching page:{}...'.format(
                    keyword, page))
                url = base_url.format(urllib.parse.quote(keyword), page)
                # Send the request
                DataLoader.BROWSER.get(url)
                # Scroll the page until the 'next page' link appears, the wait times out, or the link never exists
                # driver.execute_script('document.documentElement.scrollTop=5000')
                # Get the articles on the current page
                urls, titles, absts = get_absts_page()
                for url, title, abst in zip(urls, titles, absts):
                    if url not in self.article_dict.keys():
                        result = get_article(url)
                        if len(result) > 0:
                            # result[-3] = abst
                            self.article_dict[url] = result
                    if (url in self.article_dict) and (
                            url not in self.url_dict[keyword]):
                        self.url_dict[keyword].add(url)
                if len(urls) < 10:
                    break
        self.save_url_articl()
Example 11
    def get_word_frequency(self):
        # word_dict: word-(sk-keyword sk-tfidf time idf paper_keyword)
        # simultaneous_dict: word1_2-(sentimes subsentimes)
        assert hasattr(self, 'url_dict') and hasattr(
            self, 'article_dict'
        ), 'Please get(and merge) url and article before getting word frequency!'
        # Segment the texts and build the full vocabulary ################################
        self.word_dict = OrderedDict()  # every word and its occurrence count in the corpus
        self.simultaneous_dict = OrderedDict()  # the number of articles containing a given word
        log_print('Article nums:{}'.format(len(self.article_dict)))
        self.segmentation, self.segmentation_pos = [], []
        all_word_times = 0
        for url, article in tqdm(self.article_dict.items(), 'Splitting texts'):
            # str:title, int:read_nums, int:like_nums, str:time, list:keywords, str:abstract,
            # str:article.text, list:segmentation
            tmp = article[0] + '\n' + article[6]
            cut_result = self.cut_lac(tmp)

            tmp_ws = []
            tmp_pos = []
            for i in range(len(cut_result[0])):
                # cut_result[0][i] = cut_result[0][i].replace(' ', '')
                if cut_result[0][i] != '':
                    if cut_result[0][i] in ',.!?;,。!?;\n':
                        # Only output verbs and nouns
                        # for j in range(1, len(tmp_pos) - 1):
                        #     if tmp_pos[j] == 'v':
                        #         for k in range(j - 1, 0, -1):
                        #             if tmp_pos[k] == 'v':
                        #                 start = k
                        #                 break
                        #         for _ in range(k + 1, j):
                        #             print(tmp_ws[_], end='-')
                        #         print('[[{}]]'.format(tmp_ws[j]), end='-')
                        #         for k in range(j + 1, len(tmp_pos)):
                        #             if tmp_pos[k] != 'v':
                        #                 print(tmp_ws[k], end='-')
                        #             else:
                        #                 break
                        #         print()

                        # Output all words; mark verbs with square brackets
                        for j in range(len(tmp_pos)):
                            if tmp_pos[j] in ['v', 'vd']:
                                print('-[[{}]]-'.format(tmp_ws[j]), end='')
                            else:
                                print('-{}-'.format(tmp_ws[j]), end='')
                        print()
                        # if len(tmp_ns) > 1:
                        #     for j in range(len(tmp_ns) - 1):
                        #         for k in range(j + 1, len(tmp_ns)):
                        #             if tmp_ns[j] == tmp_ns[k]:
                        #                 continue
                        #             self.simultaneous_dict[
                        #                 '{}-{}'.format(tmp_ns[j], tmp_ns[k])] = self.simultaneous_dict.get(
                        #                 '{}-{}'.format(tmp_ns[j], tmp_ns[k]), 0) + 1
                        tmp_ws.clear()
                        tmp_pos.clear()
                    else:
                        # if cut_result[1][i] in ['n', 'f', 's', 'nw', 'nz', 'v']:
                        tmp_ws.append(cut_result[0][i])
                        tmp_pos.append(cut_result[1][i])

                if cut_result[0][i] not in self.word_dict:
                    self.word_dict[cut_result[0][i]] = [0] * 5
                self.word_dict[cut_result[0][i]][2] += 1
            self.segmentation.append(cut_result[0])
            self.segmentation_pos.append(cut_result[1])
            all_word_times += len(cut_result[0])

            cut_result_set = set(cut_result[0])
            for word in cut_result_set:
                self.word_dict[word][3] += 1

            keywords = article[4]
            for keyword in keywords:
                if keyword not in self.word_dict:
                    self.word_dict[keyword] = [0] * 5
                self.word_dict[keyword][4] = 1

        # Convert the words in the texts into a term-frequency matrix ################################
        vectorizer = CountVectorizer()
        # # Count how many times each word appears
        log_print('Counting words by sklearn...')
        corpus = [' '.join(text) for text in self.segmentation]
        X = vectorizer.fit_transform(corpus)
        # Get all keywords in the bag of words
        words = vectorizer.get_feature_names()
        log_print('keyword nums:{}'.format(len(words)))
        # Inspect keyword occurrence counts per article
        # count_array = X.toarray()
        # print(X.toarray())
        # Check keyword occurrence counts in the corpus
        for word in words:
            if word not in self.word_dict:
                self.word_dict[word] = [0] * 5
            self.word_dict[word][0] = 1

        # TF-IDF is no longer computed
        # transformer = TfidfTransformer()
        # # Convert the term-frequency matrix X into TF-IDF values
        # log_print('Calculating TF-IDF by sklearn...')
        # tfidf = transformer.fit_transform(X)
        # # Inspect the structure: tfidf[i][j] is the tf-idf weight of keyword j in document i
        # tfidf_array = tfidf.toarray()

        # Save all dictionaries locally
        # self.save_word_info()
        return self.word_dict, self.simultaneous_dict
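
For reference, a minimal standalone illustration of the sklearn counting step used above (a sketch assuming a recent scikit-learn, where get_feature_names() has been renamed get_feature_names_out()):

from sklearn.feature_extraction.text import CountVectorizer

# toy corpus standing in for the space-joined segmentation results
corpus = ['robot arm fault', 'robot sensor fault fault']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)        # sparse term-frequency matrix
words = vectorizer.get_feature_names_out()  # vocabulary, sorted alphabetically
print(words)
print(X.toarray())                          # per-document counts, shape (2, 4)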