import sys, os
up_level_N = 1
# Resolve this script's directory, then walk up up_level_N levels so the
# project root can be added to sys.path for the absolute imports below.
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import requests, random, re
from bs4 import BeautifulSoup
from selenium import webdriver
from crawl_tools.ua_pool import get_one_random_ua
from crawl_tools.request_with_proxy import request_with_proxy
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass, except_return_none
ERN_METHOD = lambda func: except_return_none(func, 'IEEE_PARSER')
EP_METHOD = lambda func: except_pass(func, 'IEEE_ARTICLE')
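
# crawl_tools.decorators is internal to this repo, so the following is only a
# hedged sketch of the behavior implied by the call sites in this file (log
# under the given model name, swallow the exception, return None) -- not the
# actual implementation:
import functools, traceback

def _sketch_except_return_none(func, ModelName=''):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            print(ModelName, traceback.format_exc())  # log and swallow
            return None
    return wrapper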
'''
@ERN_METHOD
def get_pdf_link(pdf_page_url):
    with requests.Session() as s:
        soup = BeautifulSoup(
            s.get(
                url=pdf_page_url,
                timeout=30,
                headers={
                    'User-Agent': get_one_random_ua()
                }
            ).text, 'lxml'
        )
        # The PDF viewer page embeds the document in its second <frame>;
        # the @ERN_METHOD wrapper returns None if anything here raises.
        return soup.find_all('frame')[1].get('src')
'''
Example #2
"""
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re, requests
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass, except_return_none
ERN_METHOD = lambda func: except_return_none(func, 'TaylorFrancisParser')
EP_METHOD = lambda func: except_pass(func, 'TaylorFrancisArticle')


class TF_DetailPageParser:
    '''
        http://www.tandfonline.com/doi/abs/10.1080/08912968809386468
    '''
    def __init__(self, url):
        # Fetch the given detail-page URL (the original hardcoded the sample
        # DOI from the docstring instead of using the url parameter).
        self.soup = BeautifulSoup(requests.get(url).text, 'lxml')
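
# Hypothetical usage sketch for the parser above, with the sample DOI taken
# from its docstring:
if __name__ == '__main__':
    page = TF_DetailPageParser(
        'http://www.tandfonline.com/doi/abs/10.1080/08912968809386468')
    print(page.soup.title)  # any BeautifulSoup query works from here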


class TaylorFrancisParser:
    '''
Example #3
@description:
            nope
"""
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

from journal_parser.JournalArticle import JournalArticle
from bs4 import BeautifulSoup
from crawl_tools.decorators import except_pass
EP_METHOD = lambda func: except_pass(func, 'WileyArticle')
import re


class WileyAllItemsPageParser:
    '''
        sample_url: http://onlinelibrary.wiley.com/doi/10.1002/(SICI)1096-987X(199812)19:16%3C%3E1.0.CO;2-O/issuetoc
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('Wiley.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
Example #4
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

from crawl_tools.Timer import get_beijing_time
from crawl_tools.request_with_proxy import request_with_random_ua, request_with_proxy
from db_config import REMOTE_CONNS_POOL
import psycopg2, time, random
from crawl_tools.decorators import except_pass, except_return_none
from multiprocessing.dummy import Pool as ThreadPool

EP_METHOD = lambda func: except_pass(func, 'JournalSpider')
ERN_METHOD = lambda func: except_return_none(func, 'JournalSpider')


class JournalSpider:
    def __init__(self, JournalObj):
        self.JournalObj = JournalObj
        self.volume_links = []

    def generate_volume_links(self):
        # Stub in the source; presumably filled in per publisher.
        pass

    @EP_METHOD
    def _run(self,
             AllItemsPageParser,
             JournalArticle,
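
# The snippet is cut off by the listing here. As an illustrative sketch only
# (parse_volume is hypothetical, URLs are placeholders), the ThreadPool
# imported above is typically used to fan per-volume parsing out over a few
# worker threads:
def parse_volume(link):
    return link  # stand-in for fetching and parsing one volume page

pool = ThreadPool(8)  # 8 worker threads (multiprocessing.dummy)
results = pool.map(parse_volume, ['http://example.org/vol/1',
                                  'http://example.org/vol/2'])
pool.close()
pool.join()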
Example #5
            Parser for Emerald Publisher
"""
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass
EP_METHOD = lambda func: except_pass(func, ModelName='EmeraldArticle')


class EmeraldParser:
    '''
        sample_url: http://www.emeraldinsight.com/toc/f/32/9%2F10
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('./emerald.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.find_all(class_='articleEntry')
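
# Hypothetical usage sketch: the html_source/from_web constructor pattern is
# shared by every parser in this listing. Either read the saved ./emerald.html
# from disk, or fetch the TOC (URL from the docstring) and pass the HTML in:
if __name__ == '__main__':
    import requests
    html = requests.get('http://www.emeraldinsight.com/toc/f/32/9%2F10').text
    parser = EmeraldParser(html_source=html, from_web=True)
    for entry in parser.sections:
        print(entry.get_text(strip=True)[:80])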
Example #6
@description:
           Parser for Acs Publisher
"""
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass
EP_METHOD = lambda func: except_pass(func, ModelName='AcsArticle')


class AcsParser:
    '''
        sample_url: http://pubs.acs.org/toc/mpohbp/0/0
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open("./pages/Acs.html", "rb") as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.select('.articleBox')
Example #7
@description:
            --
"""
import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass
EP_METHOD = lambda func: except_pass(func, 'SageArticle')


class SageParser:
    '''
        sample_url: http://tcn.sagepub.com/content/12/2.toc#content-block
    '''
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('Sage.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def sections(self):
        return self.soup.select('.toc-cit')
Example #8
"""

import sys, os
up_level_N = 1
SCRIPT_DIR = os.path.dirname(
    os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))))
root_dir = SCRIPT_DIR
for i in range(up_level_N):
    root_dir = os.path.normpath(os.path.join(root_dir, '..'))
sys.path.append(root_dir)

import re
from bs4 import BeautifulSoup
from journal_parser.JournalArticle import JournalArticle
from crawl_tools.decorators import except_pass
EP_METHOD = lambda func: except_pass(func, 'BioMedArticle')


class BioMedParser:
    def __init__(self, html_source=None, from_web=True):
        if not from_web:
            with open('BioMed.html', 'rb') as f:
                html_source = f.read()
        self.soup = BeautifulSoup(html_source, 'lxml')

    @property
    def pages_amount(self):
        # The listing footer reads e.g. "Page 1 of 12"; take the trailing number.
        return int(self.soup.find(text=re.compile('Page 1 of')).split(' ')[-1])

    @property
    def sections(self):