Example #1
    def _get_value(self, json_data, key, default="", convert=True):
        """
        这里的key应该是规则的key
        """
        value = default
        if key in json_data:
            value = json_data[key]
        elif key in self.reverse_key_map:
            for alias_key in self.reverse_key_map[key]:
                if alias_key in json_data:
                    value = json_data[alias_key]
                    match_key = alias_key
                    break

        ret = Utils.format_value(value, convert)

        # below, the values of specific keys can be normalized further
        if key == "release_date":
            if type(ret) is not list:
                ret = ret.replace("00:00:00", "")
                ret = Utils.format_datetime(ret)
            else:
                if len(ret) == 0:
                    return ""
                ret[0] = ret[0].replace("00:00:00", "")
                ret[0] = Utils.format_datetime(ret[0])
        return ret
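To illustrate the alias fallback above (a minimal sketch with made-up data; the Utils formatting and the release_date normalization are omitted), the lookup behaves roughly like this:

# Hypothetical data: the canonical key is "release_date", but the crawled
# record used the alias "publication date" instead.
reverse_key_map = {"release_date": ["publication date", "date published"]}
json_data = {"publication date": "2017-05-02 00:00:00"}

key = "release_date"
value = json_data.get(key, "")
if value == "":
    for alias_key in reverse_key_map.get(key, []):
        if alias_key in json_data:
            value = json_data[alias_key]
            break

print(value)  # 2017-05-02 00:00:00, later stripped of "00:00:00" and reformatted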
Example #2
class GitSpider:
    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    def _get_words(self, url):
        text = self.util.req(url)
        if not text:
            return

        soup = bs4.BeautifulSoup(text, 'lxml')
        soup_article = soup.find('article')

        return soup_article.get_text(' ') if soup_article else None

    def _save(self, url, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w') as f:
            f.write(words)

    def start(self):

        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully get {0} '.format(url))
Example #3
    def __init__(self):

        # github projects which contain many python directories
        # resource collection
        self.projectsPool = [
            # 'https://github.com/vinta/awesome-python'
        ]
        # independent repositories
        self.projectsUrl = ['https://github.com/zx576/scancode_backend']
        # invoke general class
        # crawler utility toolbox
        self.util = Utils()
Example #4
 def _transform_wiley(self, json_data):
     doi = self._get_value(json_data, "doi")
     if doi == "":
         doi = Utils.regex_extract(self._get_value(json_data, "access_url"),
                                   ".*onlinelibrary.wiley.com/doi/(.*)")
         json_data["doi"] = doi
     return json_data
Example #5
    def _format_author_sup(self, origin_sup):
        """
        Format author sup
        for example:
            case 1:
                origin_sup: a-c,d
                return: 1,2,3,4
            case 2:
                origin_sup: 1-4
                return: 1,2,3,4
        """
        origin_sup = origin_sup.encode("utf-8").replace(" ", "").replace(
            "–", "-").strip("").strip(",").replace(",", "-")
        origin_sup = "".join([
            str(ord(sup) - ord('a') + 1) if sup.isalpha() else sup
            for sup in origin_sup
        ])
        origin_sup = Utils.regex_extract(origin_sup, "\w*(\d+)")
        if origin_sup == "":
            origin_sup = '1'
        sups = origin_sup.split(",")
        res = []
        for sup in sups:
            if sup.find("-") != -1:
                elems = sup.split("-")
                if len(elems) != 2:
                    raise Exception("unexpected sup: %s" % origin_sup)
                start = int(elems[0])
                end = int(elems[1])
                res.extend(range(start, end + 1))
            else:
                res.append(sup)

        ret = ",".join([str(e) for e in res])
        return ret
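The two docstring cases (letter ranges such as "a-c,d" and numeric ranges such as "1-4" both expand to "1,2,3,4") can be sketched on their own; expand_author_sup below is a hypothetical simplification that does not use the Utils helpers:

def expand_author_sup(origin_sup):
    # Hypothetical standalone helper: "a-c,d" -> "1,2,3,4", "1-4" -> "1,2,3,4".
    origin_sup = origin_sup.replace(" ", "").strip(",")
    # Map letters to their 1-based position in the alphabet: a -> 1, b -> 2, ...
    origin_sup = "".join(
        str(ord(ch) - ord("a") + 1) if ch.isalpha() else ch
        for ch in origin_sup.lower()
    )
    expanded = []
    for token in origin_sup.split(","):
        if "-" in token:
            start, end = (int(x) for x in token.split("-", 1))
            expanded.extend(range(start, end + 1))
        elif token:
            expanded.append(int(token))
    return ",".join(str(e) for e in expanded)

print(expand_author_sup("a-c,d"))  # 1,2,3,4
print(expand_author_sup("1-4"))    # 1,2,3,4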
Example #6
 def _transform_sage(self, json_data):
     # sage pages crawled with portia are missing the doi
     doi = self._get_value(json_data, "doi")
     if doi == "":
         doi = Utils.regex_extract(self._get_value(json_data, "access_url"),
                                   ".*sagepub.com/doi/full/(.*)")
         json_data["doi"] = doi
     return json_data
Example #7
    def start(self):
        self._load_journal_meta()
        with open(self.origin_meta_file) as f:
            for line in f:
                try:
                    line = line.strip()
                    json_data = json.loads(line)
                except Exception as e:
                    continue

                if json_data["from_url"] not in self.journal_meta:
                    raise Exception("conference_url %s not expected" % json_data["from_url"])

                journal_meta = self.journal_meta[json_data["from_url"]]

                new_data = {}
                new_data["id"] = journal_meta["id"]
                new_data["conference"] = journal_meta["conference"]
                new_data["issn"] = json_data.get("issn")
                new_data["title"] = json_data.get("title")
                new_data["abstract"] = json_data.get("abstract")
                new_data["author"] = json_data.get("author")
                new_data["keywords"] = json_data.get("keywords")
                new_data["release_year"] = json_data.get("release_year")

                page = json_data.get("page", "").replace("Pages", "").replace("Page", "").strip()
                page_infos = page.split("-")
                if len(page_infos) != 1:
                    start_page = Utils.str_to_num(page_infos[0].strip())
                    end_page = Utils.str_to_num(page_infos[1].strip())
                    total_page = end_page - start_page + 1
                else:
                    start_page = Utils.str_to_num(page_infos[0].strip())
                    end_page = start_page
                    total_page = 1

                new_data["start_page"] = start_page
                new_data["end_page"] = end_page
                new_data["total_page"] = total_page
                new_data["pdf_path"] = os.path.join(
                    journal_meta["id"], Utils.get_pdf_filename(json_data))
                new_data["doi"] = json_data.get("doi")
                new_data["conference_url"] = json_data["from_url"]
                new_data["access_url"] = json_data["access_url"]
                new_data["pdf_url"] = json_data["pdf_url"]
                print json.dumps(new_data)
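The page handling above reduces a raw string such as "Pages 117-125" or "Page 42" to start, end and total page numbers. In isolation, and assuming plain integers instead of Utils.str_to_num, the step looks like the hypothetical helper below:

def parse_page_range(page_field):
    # Hypothetical helper mirroring the page logic above:
    # "Pages 117-125" -> (117, 125, 9), "Page 42" -> (42, 42, 1).
    page = page_field.replace("Pages", "").replace("Page", "").strip()
    page_infos = page.split("-")
    start_page = int(page_infos[0].strip())
    if len(page_infos) != 1:
        end_page = int(page_infos[1].strip())
    else:
        end_page = start_page
    return start_page, end_page, end_page - start_page + 1

print(parse_page_range("Pages 117-125"))  # (117, 125, 9)
print(parse_page_range("Page 42"))        # (42, 42, 1)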
Example #8
    def __init__(self):

        # manual settings
        # topic links
        self.topic = [
            # python topic
            # 'https://stackoverflow.com/documentation/python/topics'
            # 'https://stackoverflow.com/documentation/django/topics',
            # 'https://stackoverflow.com/documentation/algorithm/topics',
            'https://stackoverflow.com/documentation/git/topics',
            # 'https://stackoverflow.com/documentation/design-patterns/topics',
            # 'https://stackoverflow.com/documentation/flask/topics'
        ]
        # question links
        self.res = []
        # =======================
        #  don't change anything below
        self.util = Utils()
        self.domain = 'https://stackoverflow.com'
Example #9
    def _transform_scielo(self, json_data):
        if type(json_data["date"]) is list:
            tmp = " ".join(json_data["date"])
        else:
            tmp = json_data["date"]
        dates = tmp.encode("utf-8").replace("\xa0",
                                            " ").replace("\xc2", " ").replace(
                                                ".", "").split()

        if len(dates) == 1:
            date = dates[0]
        else:
            try:
                date = Utils.format_datetime(" ".join(dates[-2:]))
            except Exception:
                try:
                    date = Utils.format_datetime(" ".join(dates[-3:]))
                except Exception:
                    date = "%s-08-01" % dates[-1]

        json_data["release_date"] = date
        return json_data
Example #10
 def __init__(self):
     # super(_Down, self).__init__()
     self.util = Utils()
     # info for a specific documentation set
     # self.url = 'https://interactivepython.org/courselib/static/pythonds/index.html'
     # self.domain = 'https://interactivepython.org/courselib/static/pythonds/'
     # self.title = 'Problem Solving with Algorithms and Data Structures using Python.txt'
     # self.url = 'http://chimera.labs.oreilly.com/books/1230000000393/index.html'
     # self.domain = 'http://chimera.labs.oreilly.com/books/1230000000393/'
     # self.title = 'Python Cookbook.txt'
     self.url = 'http://docs.peewee-orm.com/en/stable/'
     self.domain = self.url
     self.title = 'peewee.txt'
Example #11
class Stspider:
    def __init__(self):
        self.links = _Settings().parse()
        self.util = Utils()

    # fetch all of the text content
    def _get_words(self, url):
        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        body = soup.find('body')
        if not body:
            return
        else:
            words = body.get_text(' ')

        return words

    # save the text content
    def _save(self, url, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        title = url.split('/')[-1]
        with open(PATH_DIR + '{}.txt'.format(title), 'w') as f:
            f.write(words)

    # start
    def start(self):

        if not self.links:
            return

        for url in self.links:
            words = self._get_words(url)
            self._save(url, words)
            print('successfully get {0} '.format(url))
Example #12
class _Settings():
    def __init__(self):

        # manual settings
        # topic links
        self.topic = [
            # python topic
            # 'https://stackoverflow.com/documentation/python/topics'
            # 'https://stackoverflow.com/documentation/django/topics',
            # 'https://stackoverflow.com/documentation/algorithm/topics',
            'https://stackoverflow.com/documentation/git/topics',
            # 'https://stackoverflow.com/documentation/design-patterns/topics',
            # 'https://stackoverflow.com/documentation/flask/topics'
        ]
        # question links
        self.res = []
        # =======================
        #  don't change anything below
        self.util = Utils()
        self.domain = 'https://stackoverflow.com'

    # parse all answer links under each topic
    def _parse_topic(self):
        if not self.topic:
            return
        for url in self.topic:
            self._add_url(url)

    def _add_url(self, url):

        page = self.util.req(url)
        if not page:
            return
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', class_='doc-topic-link')
        for a in soup_a:

            last = a.get('href', None)
            self.res.append(self.domain + last)

        soup_next = soup.find('a', attrs={'rel': 'next'})
        # get next page
        if soup_next:

            next_url = self.domain + soup_next['href']
            return self._add_url(next_url)

    def parse(self):

        self._parse_topic()
        return self.res
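As the spider classes in Examples #2 and #11 suggest, this settings object is consumed through its parse() method; a minimal hypothetical driver would be:

# Hypothetical driver, mirroring how Stspider and GitSpider use the class.
settings = _Settings()
links = settings.parse()  # every topic page link collected above
for link in links:
    print(link)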
Example #13
class _Settings:
    def __init__(self):

        # github projects which contain many python directories
        # resource collection
        self.projectsPool = [
            # 'https://github.com/vinta/awesome-python'
        ]
        # independent repositories
        self.projectsUrl = ['https://github.com/zx576/scancode_backend']
        # invoke general class
        # crawler utility toolbox
        self.util = Utils()

    # parse projects (like awesome-python)
    # return the urls of all directories whose domain is github.com
    # i.e. parse awesome-python style projects, return every project's github url, filtering out off-site urls
    def _parse_pool(self):

        if not self.projectsPool:
            return []

        links = []
        for project in self.projectsPool:
            page = self.util.req(project)
            if not page:
                continue
            links += self._parse_html_get_links(page)

        return links

    # use bs4 parse html
    # return all links
    def _parse_html_get_links(self, page):

        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_a = soup.find_all('a', href=re.compile('https://github.com/'))
        links = []
        for a in soup_a:
            links.append(a['href'])

        return links

    def parse(self):

        # deduplicate urls
        return list(set(self.projectsUrl + self._parse_pool()))
Example #14
    def _init(self):
        # reverse_key_map records, for each canonical key, the irregular spellings it may appear under
        self.reverse_key_map = {}
        for k, v in self.key_map.items():
            if v in self.reverse_key_map:
                self.reverse_key_map[v].append(k)
            else:
                self.reverse_key_map[v] = [k]

        # create a workdir
        workdir = Utils.generate_workdir()
        self.pass_meta_file = os.path.join(
            workdir,
            self.pass_meta_file)  #workdir + "\\" + self.pass_meta_file
        self.bad_meta_file = os.path.join(
            workdir, self.bad_meta_file)  #workdir + "\\" + self.bad_meta_file
        self.miss_pdf_file = os.path.join(
            workdir, self.miss_pdf_file)  #workdir + "\\" + self.miss_pdf_file
        self.pass_meta_writer = open(self.pass_meta_file, "w")
        self.bad_meta_writer = open(self.bad_meta_file, "w")
        self.miss_pdf_writer = open(self.miss_pdf_file, "w")
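For illustration only, with a made-up key_map, the reverse mapping built above simply groups every alias spelling under its canonical key:

# Hypothetical key_map: alias spelling -> canonical key.
key_map = {
    "publication date": "release_date",
    "date published": "release_date",
    "abstract:": "abstract",
}

reverse_key_map = {}
for k, v in key_map.items():
    reverse_key_map.setdefault(v, []).append(k)

print(reverse_key_map)
# {'release_date': ['publication date', 'date published'], 'abstract': ['abstract:']}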
Example #15
    def check_miss_journal(self):
        """
        check miss journal, args:
        python meta_check.py --file xxx --journal_file xxx(xls file) [--source_name wiley] 
        """
        required_args = ['file', 'journal_file']
        help_message = "python meta_check.py --file xxx --journal_file xxx(xls_file)" \
        " [--source_name xxx(e.g. wiley)]"
        self._verify_args(required_args, help_message)
        all_journal_meta = Utils.load_journal_meta(self.args.journal_file)
        should_crawl_journal_meta = {}
        if self.args.source_name is not None:
            for journal, meta in all_journal_meta.iteritems():
                journal_url = meta['journal_url'].lower()
                source_name = self.args.source_name.lower()
                if journal_url.find(source_name) == -1:
                    continue
                else:
                    should_crawl_journal_meta[journal] = meta
        else:
            should_crawl_journal_meta = all_journal_meta

        crawled_journal = {}
        with open(self.args.file) as fp:
            for line in fp:
                try:
                    json_data = json.loads(line)
                except Exception as e:
                    continue
                journal = json_data['journal'].lower()
                if journal in crawled_journal:
                    crawled_journal[journal] = crawled_journal[journal] + 1
                else:
                    crawled_journal[journal] = 1

        for journal, meta in should_crawl_journal_meta.iteritems():
            if journal not in crawled_journal:
                print "miss %s, url: %s" % (journal,
                                            "%s/issues" % meta['journal_url'])
Example #16
 def __init__(self):
     self.util = Utils()
Example #17
    def _transform_intechopen(self, json_data):
        item_type = self._get_value(json_data, "_type")
        if item_type == "book_item":
            # for books, extract several fields from the content string
            content = self._get_value(json_data, "content")
            #Edited by Aldemaro Romero and Edward O. Keith , ISBN 978-953-51-0844-3, 248 pages, Publisher: InTech, Chapters published November 07, 2012 under CC BY 3.0 license DOI: 10.5772/2731 Edited Volume
            #Authored by Amira Abdelrasoul, Huu Doan and Ali Lohi , ISBN 978-953-51-3662-0, Print ISBN 978-953-51-3661-3, 232 pages, Publisher: InTech, Chapters published December 06, 2017 under CC BY-NC 4.0 license DOI: 10.5772/65691 Monograph
            #ISBN 978-953-51-3376-6, Print ISBN 978-953-51-3375-9, 262 pages, Publisher: InTech, Chapters published August 23, 2017 under CC BY 3.0 license DOI: 10.5772/intechopen.68449 Monograph
            content_regex = re.compile(
                "(?P<authors>(Edited|Authored) by .*, )?ISBN (?P<isbn>[\w-]+), (?P<print_isbn>Print ISBN .*, )?(?P<pages>\d+) pages, Publisher: (?P<publisher>.*), Chapters published (?P<publish_date>.*) under (?P<license_type>.* license).*"
            )
            match = content_regex.match(content)
            if not match:
                print json_data
                raise Exception("content not match regex: %s" % content)

            json_data['dc:creater'] = match.group("authors")
            json_data['eisbn'] = match.group("isbn")
            json_data['hardcover_PISBN'] = match.group("print_isbn")
            json_data['page'] = match.group("pages")
            json_data['publisher'] = match.group("publisher")
            json_data['release_date'] = Utils.strptime(
                match.group("publish_date")).strftime("%Y-%m-%d")
            json_data['license_type'] = match.group("license_type")

            json_data.pop('chapters', None)
            json_data.pop('content', None)

        elif item_type == "chapter_item":
            # for chapters, extract and normalize the author affiliations
            if self._key_exist(json_data, "author_affliication"):
                # i.e. the author_affliication field was crawled
                author = self._get_value(json_data, "author")
                start_chars = "<div class=\"authors-front\">"
                end_chars = "</div>"
                author_content = Utils.extract_chars(author, start_chars,
                                                     end_chars)
                if author_content == "":
                    # in some cases the author still cannot be extracted..
                    author_content = self._get_value(json_data, "author_field")

                # some authors may carry multiple sups, e.g. <sup>1, </sup><sup>2</sup>, which need to be normalized
                author_content = re.sub("<sup>(\d*)(, ){0,1}</sup>",
                                        "<sup>\g<1></sup>", author_content)
                # some author lists are separated by "and"
                author_content = author_content.replace("and", ",")
                author_elems = author_content.split(",")

                authors = []
                author_sups = []
                author_affliication = json_data['author_affliication']
                author_affliication = [x for x in author_affliication if \
                    x.strip().startswith('[')]
                for author_elem in author_elems:
                    sup_start_chars = "<sup>"
                    sup_end_chars = "</sup>"
                    try:
                        sup_start_index = author_elem.index(sup_start_chars)
                        author_text = author_elem[0:sup_start_index]
                        sup = Utils.extract_chars(author_elem, sup_start_chars,
                                                  sup_end_chars)
                    except Exception as e:
                        sup = "1"
                        author_text = author_elem

                    if not sup.isdigit():
                        sup = "1"

                    authors.append(author_text)
                    author_sups.append(sup)

                json_data = Utils.format_authors(json_data, authors,
                                                 author_sups,
                                                 author_affliication)
                json_data.pop('author_field', None)

        return json_data
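As a quick illustration (not part of the original module), the book content regex above can be exercised against the first sample string quoted in the comments:

import re

content_regex = re.compile(
    r"(?P<authors>(Edited|Authored) by .*, )?ISBN (?P<isbn>[\w-]+), "
    r"(?P<print_isbn>Print ISBN .*, )?(?P<pages>\d+) pages, "
    r"Publisher: (?P<publisher>.*), Chapters published (?P<publish_date>.*) "
    r"under (?P<license_type>.* license).*"
)
content = ("Edited by Aldemaro Romero and Edward O. Keith , ISBN 978-953-51-0844-3, "
           "248 pages, Publisher: InTech, Chapters published November 07, 2012 "
           "under CC BY 3.0 license DOI: 10.5772/2731 Edited Volume")

match = content_regex.match(content)
print(match.group("isbn"))          # 978-953-51-0844-3
print(match.group("pages"))         # 248
print(match.group("publisher"))     # InTech
print(match.group("publish_date"))  # November 07, 2012
print(match.group("license_type"))  # CC BY 3.0 license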
Example #18
    def start(self):
        #a = "<font size=\"5\"><a name=\"top1\"></a>Efeito de fungicidas na    germinação <i>in vitro</i> de conídios de <i>Claviceps    africana</i><sup>(<a href=\"#back1\">1</a>)</sup></font>"
        #self._format_scielo_authors(a)
        #sys.exit(0)
        with open(self.meta_file) as f:
            for line in f:
                try:
                    self.total = self.total + 1
                    line = line.strip()
                    json_data = json.loads(line)
                except Exception as e:
                    self.json_fail = self.json_fail + 1
                    continue

                # check 1: access_url is the most basic field; if even this was not crawled, the record cannot be traced at all
                access_url = self._get_value(json_data, "access_url")
                if access_url == "":
                    self.no_access_url = self.no_access_url + 1
                    continue

                # apply platform-specific transformations to the metadata
                json_data = self._transform(json_data)

                # check 2: verify that the required fields were collected
                miss_required_field = False
                for key in self.required_keys:
                    value = self._get_value(json_data, key)
                    if value == "":
                        # an empty value means this field was not collected
                        bad_record = {}
                        bad_record['reason'] = "%s empty" % key
                        bad_record['access_url'] = access_url
                        self._mark_bad_record(bad_record)
                        self.incomplete = self.incomplete + 1
                        miss_required_field = True
                        break

                if miss_required_field:
                    continue

                # check 3: make sure required fields in the metadata are not empty
                fail = False
                for key, value in json_data.iteritems():
                    key = key.strip(":").strip()
                    value = Utils.format_value(value)
                    if value == "" and key in self.required_keys:
                        if key == "release_year":
                            publish_data = self._get_value(
                                json_data, "release_date")
                            if publish_data != "":
                                #if publish data is also empty, there is no way to get publish_year
                                json_data["release_year"] = publish_data.split(
                                    "-")[0]
                                print "publish year is %s" % json_data[
                                    "release_year"]
                                continue

                        bad_record = {}
                        bad_record['reason'] = "%s empty" % key
                        bad_record['access_url'] = access_url
                        self._mark_bad_record(bad_record)
                        self.incomplete = self.incomplete + 1
                        fail = True
                        break

                if fail:
                    continue

                # check 4: fill in a few required fields
                json_data['acquisition_time'] = Utils.current_time()
                publish_year = self._get_value(json_data, "release_year")
                if publish_year == "":
                    publish_data = self._get_value(json_data, "release_date")
                    if publish_data != "":
                        json_data["release_year"] = publish_data.split("-")[0]

                # handle fields such as author, author_sub and author_affiliation

                if access_url in self.pass_meta_map:
                    title = self._get_value(json_data, "title")
                    if title != self.pass_meta_map[access_url]:
                        pass
                        #raise Exception("same url with different title, not gonna happen :%s" % access_url)
                    self.dup = self.dup + 1
                    continue

                self.pass_count = self.pass_count + 1
                self._mark_success_record(json_data)
                self.pass_meta_map[access_url] = json_data["title"]

        if self.args.pdf_dir is not None:
            print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pdf_non_exist: %d, pdf_exist_count: %d, pass meta save to: %s, fail meta save to :%s, miss pdf url save to :%s" \
            % (self.total, self.no_access_url, self.json_fail, self.incomplete, self.dup, self.pass_count, self.pdf_non_exist, self.pdf_exist, self.pass_meta_file, self.bad_meta_file, self.miss_pdf_file)
        else:
            print "total: %d, no_access_url: %d, json_fail: %d, incomplete: %d, dup_count: %d, pass_count: %d. pass meta save to: %s, fail meta save to :%s" \
            % (self.total, self.no_access_url, self.json_fail, self.incomplete, self.dup, self.pass_count, self.pass_meta_file, self.bad_meta_file)
Example #19
 def __init__(self):
     self.links = _Settings().parse()
     self.util = Utils()
Example #20
    def _mark_success_record(self, json_data):
        """
        Mark a successful record.

        1. Normalize the keys, and add any required fields along the way
        2. If a pdf save dir is specified, join it with the pdf filename
        3. Drop useless keys (e.g. portia crawls add fields like _template)
        @param json_data
        """
        publish_data = self._get_value(json_data, "release_date")
        if "release_year" not in json_data or json_data["release_year"] == "":
            #if publish data is also empty, there is no way to get publish_year
            if "release_date" in json_data:
                json_data["release_year"] = publish_data.split()[-1]

        if "keywords" in json_data:
            if type(json_data["keywords"]) is list \
            and len(json_data["keywords"]) == 1:
                    # for some journals crawled with portia, all keywords end up in one element and should be split
                keywords = json_data["keywords"][0].replace("Keywords",
                                                            "").strip()
                json_data["keywords"] = keywords.split(";")
                if len(json_data["keywords"]) == 1:
                    json_data["keywords"] = keywords.split(",")
            elif self.for_oa and type(json_data["keywords"]) is not list:
                keywords = json_data["keywords"].replace(
                    "Index terms:", "").replace("Keywords", "").split(";")
                json_data["keywords"] = keywords

                # for oa, if keywords is not a list, convert it to a list

        convert_data = {}
        for key, value in json_data.iteritems():
            format_key = key.strip(":").strip().lower()

            if format_key in self.key_map:
                format_key = self.key_map[format_key]

            # for these keys, lists are not converted to strings
            if format_key in self.reserved_non_converted_keys:
                convert = False
            else:
                convert = not self.for_oa

            value = self._get_value(
                json_data, format_key,
                convert=convert)  # _get_value also normalizes the value here
            convert_data[format_key] = value

        # normalize authors and author affiliations
        is_scielo = False
        if is_scielo:
            # 2018.04.18: painful, but scielo authors have to be handled separately
            if convert_data["author"][0].find("<") != -1:
                author_raw_text = " ".join(convert_data["author"])
                authors = self._format_scielo_authors(author_raw_text)
                convert_data['author'] = authors

            convert_data.pop("author_affiliation", None)
        elif 'author' in convert_data and len(
                convert_data["author"]
        ) == 1:  # this author field may be one blob of html containing several authors, each with its sup marked in <sup>
            # this happens when affiliations are hard to crawl and the whole html document lands in the author field
            authors, author_sups = self._format_authors(convert_data)
            if 'author_sup' not in convert_data:
                convert_data['author_sup'] = author_sups
            else:
                convert_data['author_sup'] = [
                    self._format_author_sup(sup)
                    for sup in convert_data['author_sup']
                ]
            if len(authors) == 1:
                # if author still has only one element, it is probably comma-separated
                authors = authors[0].split(",")

            convert_data['author'] = authors

        if 'author_sup' in convert_data:
            convert_data['author_sup'] = [
                self._format_author_sup(sup)
                for sup in convert_data['author_sup']
            ]

        if "author_affiliation" in convert_data and len(
                convert_data['author_affiliation']) == 1:
            # this author_affiliation may be one blob of html
            author_affiliation = convert_data['author_affiliation'][0]
            try:
                authors = convert_data['author']
                if author_affiliation.startswith(authors[0]):
                    # affiliations here are delimited by author names, e.g.: https://koedoe.co.za/index.php/koedoe/article/view/188
                    author_affiliation = self._format_author_affiliations_by_author(
                        author_affiliation, authors)
                else:
                    author_affiliation = self._format_author_affiliations(
                        convert_data)
                convert_data['author_affiliation'] = author_affiliation
            except Exception as e:
                # authors were not crawled but affiliations were; ignore this case for now
                authors = []
                convert_data['author_affiliation'] = []
                convert_data['author'] = []
                convert_data['author_sup'] = []

        # some author_sup lists contain empty strings, e.g. for scielo
        if "author_sup" in convert_data and type(
                convert_data["author_sup"]) is list:
            convert_data["author_sup"] = [
                i for i in convert_data["author_sup"] if i != ''
            ]

        if self.args.pdf_dir is not None:
            filename = Utils.get_pdf_filename(json_data)
            pdf_path = os.path.join(self.args.pdf_dir, filename + ".pdf")
            txt_path = os.path.join(self.args.pdf_dir, filename + ".txt")
            if os.path.exists(pdf_path):
                convert_data["pdf_path"] = filename + ".pdf"
                self.pdf_exist = self.pdf_exist + 1
            elif os.path.exists(txt_path):
                convert_data["pdf_path"] = filename + ".txt"
                self.pdf_exist = self.pdf_exist + 1
            else:
                #print "pdf path(%s) or txt path(%s) not exist" % (pdf_path, txt_path)
                convert_data["pdf_path"] = "wrong"
                self.pdf_non_exist = self.pdf_non_exist + 1
                pdf_link = self._get_value(json_data, "pdf_url")
                if pdf_link == "":
                    raise Exception("cannot get pdf_url from json %s" %
                                    json_data)
                self.miss_pdf_writer.write(pdf_link)
                self.miss_pdf_writer.write("\n")

        # normalize author and author_affiliation
        if not self.for_oa:
            pass
            #convert_data = Utils.format_authors_from_json(convert_data)

        # drop some keys
        if not self.for_oa:
            convert_data.pop("author_sup", None)
        else:
            # oa needs some special handling
            convert_data["doi"] = self._get_value(convert_data, "doi").replace(
                "https://doi.org/", "").replace("http://doi.org/", "")
        convert_data.pop('_template', None)

        convert_data_str = json.dumps(convert_data)
        self.pass_meta_writer.write(convert_data_str)
        self.pass_meta_writer.write("\n")
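One small piece of the normalization above, the keyword splitting, can be shown in isolation; split_keywords below is a hypothetical, simplified helper:

def split_keywords(raw):
    # Hypothetical helper: strip a leading "Keywords" label,
    # split on ";" and fall back to "," when no ";" is present.
    cleaned = raw.replace("Keywords", "").strip()
    keywords = cleaned.split(";")
    if len(keywords) == 1:
        keywords = cleaned.split(",")
    return [k.strip() for k in keywords]

print(split_keywords("Keywords heat transfer; membranes; fouling"))
# ['heat transfer', 'membranes', 'fouling']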
Example #21
class _Down:
    def __init__(self):
        self.util = Utils()

    def _save(self, title, words):

        self.util.checkpath(PATH_DIR)
        if not words:
            return
        with open(PATH_DIR + title, 'a+') as f:
            f.write(words)

    # recursively crawl every link of the documentation site
    def _download(self, qu, domain, title, switch=True):
        # print(title)
        if qu.empty():
            return

        url = qu.get()
        text = self.util.req(url)

        if not text:
            # qu.put(url)
            return self._download(qu, domain, title, False)

        if switch:
            res = self._download_links(domain, text)
            for i in res:
                qu.put(i)

        words = self._download_docs(text)
        self._save(title, words)

        return self._download(qu, domain, title, switch=False)

    def _download_docs(self, page):

        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_body = soup.find('body')
        words = ''
        if soup_body:
            words += soup_body.get_text(' ')

        return words

    def _download_links(self, domain, page):

        lst = []
        soup = bs4.BeautifulSoup(page, 'lxml')
        soup_link = soup.find_all('a')
        for link in soup_link:
            lst.append(domain + link['href'])

        return lst

    def download(self, url, domain, title):
        # title = 'Problem Solving with Algorithms and Data Structures using Python.pdf'
        qu = queue.Queue()
        qu.put(url)

        return self._download(qu, domain, title)
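Example #10 above shows the kind of url/domain/title triple this class is configured with; a hypothetical driver tying the two together:

# Hypothetical usage, combining Example #10's settings with the _Down class above.
down = _Down()
down.download(
    url='http://docs.peewee-orm.com/en/stable/',
    domain='http://docs.peewee-orm.com/en/stable/',
    title='peewee.txt',
)
# The crawled text is appended to PATH_DIR + 'peewee.txt'.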
Example #22
# coding=utf-8
# author = zhouxin
# description
# download pdf files: add the pdf download urls to downlst, then run this script

import requests
from spiders.utils import Utils

PATH_DIR = 'download/'
util = Utils()


def download(url):

    util.checkpath(PATH_DIR)

    req = requests.get(url)
    c = req.content
    name = url.split('/')[-1]
    with open(PATH_DIR + name, 'wb') as f:
        f.write(c)


downlst = [
    # 'http://files2.syncfusion.com/Downloads/Ebooks/SciPy_Programming_Succinctly.pdf',
    # 'https://docs.google.com/file/d/0B8IUCMSuNpl7MnpaQ3hhN2R0Z1k/edit'
    # 'http://stock.ethop.org/pdf/python/Learning%20Python,%205th%20Edition.pdf',
    # 'http://slav0nic.org.ua/static/books/python/OReilly%20-%20Core%20Python%20Programming.pdf',
    #  ///////////
    # 'http://www.oreilly.com/programming/free/files/functional-programming-python.pdf',
    # 'https://doc.lagout.org/programmation/python/Python%20Pocket%20Reference_%20Python%20in%20Your%20Pocket%20%285th%20ed.%29%20%5BLutz%202014-02-09%5D.pdf',
]

# download every url in the list
for url in downlst:
    download(url)
Example #23
import json
import re
import sys

from spiders.utils import Utils

filename = sys.argv[1]
columnname = sys.argv[2]
split = '|'

with open(filename) as fp:
    for line in fp:
        try:
            json_date = json.loads(line)
        except Exception as e:
            continue
        columns = columnname.split(",")
        line = ""
        for column in columns:
            try:
                data = Utils.format_value(json_date[column], join_char='|')
            except Exception as e:
                data = ""

            if column == 'url':
                data = re.sub("\?journalCode=.*", "", data)

            if isinstance(data, int):
                line += str(data) + split
            else:
                line += data.replace('\n', '').replace('\t',
                                                       '').strip() + split
        #print line.strip().replace(u'ê', 'e').replace(u'é', 'e').replace(u'ã', 'a').replace(u'ó', 'o').replace(u'ú', 'u').strip(split)
        print line.strip().strip(split)