Example #1
    def getSparkLogger(cls, logfile="", conffile=""):
        import os, sys, logging
        from conf import Conf

        if logfile == "":
            logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
                "logfile", conffile=conffile)
        # (a JVM-side alternative is log4j via SparkContext._jvm; this
        #  version writes plain Python logging into the YARN log directory,
        #  so the conf-derived logfile above is effectively unused here)

        # YARN exposes the container log directories via LOG_DIRS
        if 'LOG_DIRS' not in os.environ:
            sys.stderr.write(
                'Missing LOG_DIRS environment variable, pyspark logging disabled'
            )
            return

        logpath = os.environ['LOG_DIRS'].split(',')[0] + '/log'
        logging.basicConfig(
            filename=logpath,
            level=logging.INFO,
            format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s'
                   ' - %(funcName)s: %(message)s')
        logger = logging.getLogger()
        return logger
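
A minimal usage sketch for the Spark variant. The `from log import Log` import mirrors Example #12's usage; treating getSparkLogger as a classmethod of that class is an assumption, and LOG_DIRS is set by the YARN NodeManager for each container:

    # hypothetical usage; Log's module path is an assumption
    from log import Log

    logger = Log.getSparkLogger()
    if logger is not None:  # None means LOG_DIRS was absent
        logger.info("spark executor logging initialised")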
Example #2
    def getLogger(cls, logfile="", conffile=""):
        from conf import Conf

        if (logfile == ""):
            logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
                "logfile", conffile=conffile)
        loglevel = Conf.getconf("loglevel", conffile=conffile)
        rotate_log_size = Conf.getconf("rotate_log_size")

        import logging, logging.handlers
        logger = logging.getLogger()

        if len(logger.handlers) < 1:  # add the file handler only once per process
            rfh = logging.handlers.RotatingFileHandler(
                filename=logfile,
                maxBytes=rotate_log_size,
                backupCount=Conf.getconf("backup_log_count"))
            formatter = logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s')
            rfh.setFormatter(formatter)
            logger.addHandler(rfh)


        id_ = id(logger)
        logger.setLevel(getattr(logging, loglevel))  # e.g. "DEBUG" -> logging.DEBUG
        logger.debug(
            "return logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
            .format(**locals()))
        return logger
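
Since logging.getLogger() with no name always returns the same root logger, the handler-count guard is what keeps repeated calls from stacking duplicate RotatingFileHandlers (which would write every record twice). A sketch of that property, assuming the method hangs off a Log class as in Example #6 and nothing else has touched the root logger:

    from log import Log

    a = Log.getLogger()
    b = Log.getLogger()
    assert a is b                  # one root logger per process
    assert len(a.handlers) == 1    # the guard added the file handler only once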
Example #3
 def __init__(self):
     self.log = Log.getLogger()
     self.driver = self.create_driver()
     self.top_url = Conf.getconf("kakaku_top_page")
     self.target_stores = Conf.getconf("target_stores")
     self.extract_store_name = re.compile(r"\'")
     self.warning_messages = False
     self.log.debug(__class__.__name__ + "." +
                    sys._getframe().f_code.co_name + " start")
Example #4
 def move_to_vendor_page(self, vendor_button):
     self.log.debug(__class__.__name__ + "." +
                    sys._getframe().f_code.co_name + " start.")
     self.driver.get(vendor_button.get_attribute("href"),
                     warning_messages=self.warning_messages)
     self.log.debug("wait start")
     for sec in range(Conf.getconf("phantomJS_load_timeout")):
         self.log.debug("wait redirect " + str(sec) + "[sec]")
         if self.driver.title:
             self.log.debug("move to shop page finished. page title: " +
                            self.driver.title)
             break
         time.sleep(Conf.getconf("vendor_page_wait_time"))
     self.log.debug(__class__.__name__ + "." +
                    sys._getframe().f_code.co_name + " finished.")
Example #5
    def save_current_page(self, filename):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        path, suffix = os.path.splitext(filename)
        max_filename_length = Conf.getconf("max_filename_length")
        if len(path) > max_filename_length:
            self.log.debug("filename too long. convert from :" + filename)
            filename = path[:max_filename_length] + suffix
            self.log.debug("to :" + filename)

        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            f = open(filename, 'w')
            f.write(self.page_source)
            f.close()
        elif suffix == ".png":
            self.save_screenshot(filename)
        elif suffix == ".pdf":
            pngname = os.path.splitext(filename)[0] + ".png"
            self.save_screenshot(pngname)
            self.convert_png_to_pdf(pngname)
        else:
            self.log.error(__class__.__name__ + "." +
                           sys._getframe().f_code.co_name)
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug("saved to " + filename)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
Example #6
 def getLogger(cls, logfile="", conffile=""):
     exec_env = Conf.getconf("exec_env")
     if exec_env == "normal":
         return cls.getNormalLogger(logfile=logfile, conffile=conffile)
     elif exec_env == "spark":
         return cls.getSparkLogger(logfile=logfile, conffile=conffile)
     else:
         sys.exit("getLogger Type Error[" + str(exec_env) + "]")
Example #7
    def getNormalLogger(cls, logfile="", conffile=""):
        if logfile == "":
            logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
                "logfile", conffile=conffile)

        logger = logging.getLogger(logfile)
        if len(logger.handlers) > 1:  # called before and already created.
            return logger

        loglevel = Conf.getconf("loglevel", conffile=conffile)
        rotate_log_size = Conf.getconf("rotate_log_size")
        if len(logger.handlers) < 1:
            rfh = logging.handlers.RotatingFileHandler(
                filename=logfile,
                maxBytes=rotate_log_size,
                backupCount=Conf.getconf("backup_log_count"))
            formatter = logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s')
            rfh.setFormatter(formatter)
            logger.addHandler(rfh)

            if Conf.getconf("loglevel_to_stdout", conffile=conffile):
                stream_handler = logging.StreamHandler()
                stream_handler.setFormatter(formatter)
                stream_handler.setLevel(
                    Conf.getconf("loglevel_to_stdout", conffile=conffile))
                logger.addHandler(stream_handler)

        id_ = id(logger)
        logger.setLevel(getattr(logging, loglevel))  # avoid eval on the conf value
        logger.debug(
            "return normal logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
            .format(**locals()))
        return logger
Example #8
    def establish_session(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import sqlalchemy
        from sqlalchemy.orm import sessionmaker
        from conf import Conf

        # note: the conf key is spelled "myslq_url" in the source configuration
        self.engine = sqlalchemy.create_engine(Conf.getconf("myslq_url"),
                                               echo=False)
        Session = sessionmaker(bind=self.engine)
        session = Session()
        session.expire_on_commit = False
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return session
Example #9
    def __init__(self, executable_path="", port=0,
                 desired_capabilities=DesiredCapabilities.PHANTOMJS,
                 service_args=None, service_log_path=None):
        self.executable_path = executable_path
        self.port = port
        self.PHANTOMJS = desired_capabilities
        self.service_args = service_args
        self.service_log_path = service_log_path

        self.log = Log.getLogger()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        import logging, logging.handlers
        selenium_logger = logging.getLogger(
            'selenium.webdriver.remote.remote_connection')
        selenium_logger.setLevel(logging.ERROR)
        if len(selenium_logger.handlers) < 1:
            rfh = logging.handlers.RotatingFileHandler(
                filename=Conf.getconf("logdir") +
                Conf.getconf("phantomjs_logfile"),
                maxBytes=Conf.getconf("rotate_log_size"),
                backupCount=Conf.getconf("backup_log_count"))
            formatter = logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s')
            rfh.setFormatter(formatter)
            selenium_logger.addHandler(rfh)

            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(formatter)
            stream_handler.setLevel(Conf.getconf("loglevel_to_stdout"))
            selenium_logger.addHandler(stream_handler)

        if self.executable_path == "":
            self.executable_path = Conf.getconf("phantomJS_pass")
        if self.service_args is None:
            self.service_args = ["--webdriver-loglevel=DEBUG"]
        if self.service_log_path is None:
            self.service_log_path = Conf.getconf("logdir") + Conf.getconf(
                "phantomjs_logfile")
        self.log.debug(__class__.__name__ + ".super().__init__ start")
        super().__init__(executable_path=self.executable_path,
                         port=self.port,
                         desired_capabilities=self.PHANTOMJS,
                         service_args=self.service_args,
                         service_log_path=self.service_log_path)
        self.set_page_load_timeout(Conf.getconf("phantomJS_load_timeout"))
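
A hedged construction sketch for the wrapper. The module name phantomjs_ comes from Example #12; the URL, tag, and screenshot path are illustrative only:

    from phantomjs_ import PhantomJS_

    driver = PhantomJS_()  # conf supplies the binary path, log file and timeouts
    driver.get("http://example.com/", tag_to_wait="//title", by="xpath")
    driver.save_current_page("../../var/ss/example.png")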
Example #10
    def save_cheapest_pdf(self, product_name, logger=None):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start.")
        self.log.debug("product_name[" + product_name + "]")
        if logger:
            self.log.debug("change logger")
            self.log = logger
            self.driver.log = logger

        print("move to kakaku.com")
        self.move_to_top_page()

        print("search product. name[" + product_name + "]")
        search_results = self.search_product(product_name)
        if len(search_results) > 1:
            self.log.warning("search results of product_name[" +
                             str(product_name) + "] = " +
                             str(len(search_results)) + " > 1.")
            self.log.warning("use only first result.")

        print("click top of search result")
        if not self.driver.click(search_results[0],
                                 warning_messages=self.warning_messages):
            self.log.error("click failed. Please retry.")
            #exit(1)
            raise Exception
        #tag = '//td[@class="fRed"]/p[@class="wordwrapTrs"]/a'
        tag = '//p[@class="wordwrapShop"]/a'
        self.driver.wait_appearance_of_tag(by="xpath", tag=tag)

        print("get cheapest vendor")
        cheapest_vendor, vendor_name = self.get_cheapest_vendor_button(
            product_name)

        print("move_to_vendor_page")
        self.move_to_vendor_page(cheapest_vendor)
        path = Conf.getconf("pdf_save_path")
        print("save as " + path + "/" + product_name + "|" + vendor_name +
              ".pdf")
        self.driver.save_current_page(path + "/" + product_name + "|" +
                                      vendor_name + ".pdf")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished.")
Example #11
    def get(self,
            url,
            tag_to_wait="",
            by="xpath",
            timeout="default",
            warning_messages=True):
        if timeout == "default":
            timeout = self.load_timeout
        retries = 10
        while retries > 0:
            try:
                self.log.debug("super().get(" + url + ") start")
                super().get(url)
                break
            except RemoteDisconnected as e:
                self.log.debug("PhantomJS caught RemoteDisconnected at get " +
                               url)
                self.log.debug("%s", e)
                self.log.debug("retries[" + str(retries) + "]")
                # re-create the underlying driver session, then retry
                super().__init__(executable_path=self.executable_path,
                                 port=self.port,
                                 desired_capabilities=self.PHANTOMJS,
                                 service_args=self.service_args,
                                 service_log_path=self.service_log_path,
                                 logger=self.log)
                retries -= 1
            except TimeoutException as e:
                self.save_error_messages_at(sys._getframe().f_code.co_name,
                                            "by[" + by + "], tag[" +
                                            tag_to_wait + "]",
                                            warning_messages,
                                            e,
                                            url=url)
                self.execute_script("window.stop();")
                retries -= 1  # count timeouts too so the loop stays bounded

        if retries == 0:
            self.log.error("PhantomJS caught ERROR RemoteDisconnected at get " +
                           url)
            self.save_current_page("../../var/ss/get_error.html")
            self.save_current_page("../../var/ss/get_error.png")
        wait_time = Conf.getconf("phantomJS_wait_time_per_get")
        self.log.debug("get finished. wait " + str(wait_time) + " seconds")
        time.sleep(wait_time)
        if tag_to_wait != "":
            self.wait_appearance_of_tag(by=by,
                                        tag=tag_to_wait,
                                        timeout=timeout)
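
Stripped of the Selenium specifics, the retry loop above has this generic shape (a sketch, not part of the class):

    def get_with_retries(fetch, reconnect, retries=10):
        """Retry `fetch` after dropped connections; `reconnect` rebuilds the session."""
        while retries > 0:
            try:
                fetch()              # stands in for super().get(url)
                return True
            except ConnectionError:  # stands in for RemoteDisconnected
                reconnect()          # stands in for re-running __init__
                retries -= 1
        return False                 # caller saves the error page and moves on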
Example #12
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()

        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self,
                               keywords,
                               num_of_papers="all",
                               search_options="default",
                               path="../../data/tmp/",
                               filename="title",
                               timeout=30):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")

        driver = self.create_driver(timeout=timeout)
        if search_options == "default":
            search_options = Search_options()

        self.search_by_keywords(driver,
                                keywords,
                                search_options=search_options,
                                timeout=timeout)

        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
        self.log.debug("num_of_papers[" + str(num_of_papers) + "]")

        urls = self.get_urls_of_papers_in_keywords_page(
            driver, search_options.PerPage, num_of_papers, timeout)
        print("urls.size[" + str(len(urls)) + "]")
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []

        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/math")
        from searchs import Searchs
        search = Searchs(limit=num_of_papers)

        for url in urls:
            search.node = url
            paper, citing_urls, cited_urls = self.get_attributes_and_download_pdf(
                search, driver, path=path, filename=filename)
            print("paper.title[" + paper.title + "]")
            all_papers.append(paper)
            all_citing_urls.extend(citing_urls)
            all_cited_urls.extend(cited_urls)

        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " finished")
        return all_papers, urls, all_citing_urls, all_cited_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, url="", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("url[" + url + "]")

        if url == "" or url == self.conf.getconf("IEEE_top_page"):
            url = self.conf.getconf("IEEE_top_page")

        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/scraping")
        from phantomjs_ import PhantomJS_
        driver = PhantomJS_(desired_capabilities={
            'phantomjs.page.settings.resourceTimeout': timeout
        })

        self.log.debug("driver.get(" + url + ")")
        driver.get(url,
                   tag_to_wait='//li[@class="Media-articles-item"]',
                   by="xpath",
                   timeout=timeout)
        self.log.debug("driver.get finished")
        """
		if url == self.conf.getconf("IEEE_top_page"):
			self.log.debug("Wait start.")
			try:
				WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//li[@class="Media-articles-item"]'))
			except TimeoutException:
				self.log.warning("caught TimeoutException at load the iEEE top page.")
			except NoSuchElementException:
				self.log.warning("caught NoSuchElementException at load the iEEE top page.")

			self.log.debug("Wait Finished.")
		"""
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def wait_search_results(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        self.log.debug("Wait start.")
        try:
            tag = '//input[@type="checkbox" and @data-group="search-results-group" and @ng-checked="vm.allSelected()"]'
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(tag))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the keywords results page.")
            self.log.warning("at " + sys._getframe().f_code.co_name)
            self.log.warning("url[" + driver.current_url + "]")
            self.log.warning("tag[find_element_by_xpath(" + tag + ")")
            filename = "./samples/TimeoutExceptionatLoadtheKeywordsResultsPage." + re.sub(
                r"/|:|\?", "", driver.current_url)
            self.save_current_page(driver, filename + ".png")
            self.save_current_page(driver, filename + ".html")

        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the keywords results page."
            )

        self.log.debug("Wait Finished.")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def search_by_keywords(self,
                           driver,
                           keywords,
                           search_options="default",
                           timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.wait_appearance_of_tag(by="name",
                                      tag='queryText',
                                      timeout=timeout)
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.wait_search_results(driver, timeout)

        self.set_options(driver, search_options, timeout)

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def set_options(self, driver, search_options, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            # Only PerPage is applied; the remaining options captured in the
            # original notes (show, SortBy, ContentType, YearType,
            # YearFrom/YearTo, Year, Author, Affiliation, PublicationTitle,
            # Publisher, ConferenceLocation) are not handled yet.
            if search_options.PerPage != 25:
                element = driver.find_element_by_css_selector(
                    'div[ng-model="vm.rowsPerPage"] > div > select')
                Select(element).select_by_visible_text(
                    str(search_options.PerPage))
                self.wait_search_results(driver, timeout)
        except NoSuchElementException:
            print("caught NoSuchElementException at get_citing_papers.")
            self.save_current_page(
                driver, "./samples/NoSuchElementException_in_set_options.png")
            self.save_current_page(
                driver, "./samples/NoSuchElementException_in_set_options.html")

        #self.save_current_page(driver, "./samples/after_set_options.png")
        #self.save_current_page(driver, "./samples/after_set_options.html")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def get_urls_of_papers_in_keywords_page(self,
                                            driver,
                                            PerPage,
                                            num_of_papers="all",
                                            timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
        self.log.debug("num_of_papers[" + str(num_of_papers) + "]")

        urls = []

        next_button = driver.find_element_by_xpath(
            '//a[@href="" and @ng-click="selectPage(page.number)" and @class="ng-binding"]'
        )
        visited_buttons = [next_button.text]
        while True:
            self.log.debug("get paper urls in current page")
            for i in range(PerPage):
                paper_elements = driver.find_elements_by_xpath(
                    '//div[@class="js-displayer-content u-mt-1 stats-SearchResults_DocResult_ViewMore ng-scope hide"]'
                )
                self.log.debug("i[" + str(i) + "] len(paper_elements)[" +
                               str(len(paper_elements)) + "]")
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                if len(paper_elements) == PerPage:
                    break
            self.log.debug("len(paper_elements)[" + str(len(paper_elements)) +
                           "]")

            for paper_element in paper_elements:
                url = paper_element.find_element_by_css_selector(
                    'a').get_attribute("href")
                self.log.debug("url[" + url + "]")
                urls.append(url)
                if len(urls) > num_of_papers:
                    self.log.debug("len(urls)[" + str(len(urls)) +
                                   "] > num_of_papers[" + str(num_of_papers) +
                                   "]. return urls.")
                    return urls

            self.log.debug("search buttons to next page")
            buttons = driver.find_elements_by_xpath(
                '//a[@href="" and @ng-click="selectPage(page.number)" and @class="ng-binding"]'
            )
            i = 0
            for button in buttons:
                self.log.debug("i[" + str(i) + "], button.text[" +
                               button.text + "], visited_buttons:" +
                               str(visited_buttons))
                if button.text not in visited_buttons:
                    next_button = button
                    self.log.debug("break")
                    break
                i += 1
            if i == len(buttons):
                self.log.debug(
                    "i = len(buttons). already visited all buttons. break")
                break

            visited_buttons.append(next_button.text)
            self.log.debug("move to next page[" + next_button.text + "]")
            next_button.click()
            self.wait_search_results(driver, timeout)

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return urls[" + str(len(urls)) + "]")
        return urls

    def get_attributes_and_download_pdf(self,
                                        search,
                                        driver,
                                        path="../../data/tmp/",
                                        filename="title"):
        print(__class__.__name__ + "." + sys._getframe().f_code.co_name +
              " start")
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")

        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        search.times += 1

        timeout = 30
        target_paper_url = search.node

        m = "url[" + target_paper_url + "], times[" + str(
            search.times) + "], limit[" + str(search.limit) + "]"
        print(m)
        self.log.info(m)

        # (earlier revisions reconnected the driver every few papers to work
        #  around http.client.RemoteDisconnected, and skipped papers already
        #  recorded in search.visited)

        self.move_to_paper_initial_page(driver, target_paper_url)

        import table_papers
        paper = table_papers.Table_papers()

        self.log.debug("get attributes of this paper")
        paper.title = self.get_title(driver)
        paper.authors = self.get_authors(driver)
        paper.keywords = self.get_keywords(driver)
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        paper.citeds, cited_papers, cited_urls = self.get_cited_papers(
            driver, timeout)
        paper.conference = self.get_conference(driver)
        paper.published = self.get_date_of_publication(driver)
        paper.url = target_paper_url
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        if filename == "title":
            filename = paper.title + ".pdf"
        paper.path = self.download_a_paper(driver,
                                           path=path,
                                           filename=filename,
                                           timeout=timeout)
        self.log.debug("download finished. wait start.")
        time.sleep(self.conf.getconf("IEEE_wait_time_per_download_paper"))
        self.log.debug("wait finished.")
        paper.id = paper.get_id()

        self.log.debug(paper.get_vars())
        paper.renewal_insert()

        self.log.debug("insert citations of this paper to db")
        import table_citations

        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
            citation.close()
        for cited_paper in cited_papers:
            citation = table_citations.Table_citations(start=cited_paper.id,
                                                       end=paper.id)
            citation.renewal_insert()
            citation.close()

        self.log.debug("check termination of searching loop")
        if 0 < search.limit and search.times >= search.limit:
            self.log.debug("search finished.")
            search.que = [search.node]
            import signal
            driver.service.process.send_signal(
                signal.SIGTERM)  # kill the specific phantomjs child proc
            driver.quit()  # quit the node proc
            return paper, [], []

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return paper[" + paper.title + "] citing_urls[" +
                       str(citing_urls) + "] cited_urls[" + str(cited_urls) +
                       "]")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        return driver.title

    def get_authors(self, driver):
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')

        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        keywords_str = ""
        seen = []
        elements = driver.find_elements_by_xpath('//a[@ng-bind-html="::term"]')
        for el in elements:
            keyword = el.text
            # exact-match duplicate check; a substring test would wrongly
            # drop "int" once "internet" had been collected
            if keyword in seen:
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not added.")
            else:
                seen.append(keyword)
                keywords_str += "," + keyword
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []

        elements = []  # find_elements returns [] rather than raising, but keep the guard
        try:
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="article in vm.contextData.similar"]')
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at get_citing_papers.")

        self.log.debug(str(len(elements)))
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.html")
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.png")
        self.log.debug("create arrays of paper and url")

        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = self.conf.getconf(
                "IEEE_website") + el.find_element_by_css_selector(
                    'a').get_attribute("ng-href")
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            self.log.debug("citing_url[" + citing_paper.url + "]")
            self.log.debug("citing_title[" + citing_paper.title + "]")
            self.log.debug("citing_authors[" + citing_paper.authors + "]")
            self.log.debug(citing_paper.get_vars())

            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)

        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        import table_papers

        citeds_str = ""
        cited_papers = []
        cited_urls = []

        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.png")

        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > div[class="ng-scope"] > div[class="strong"]'
            ).text
            if div == "Citations not available for this document.":
                self.log.debug("this paper not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")

        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//b[@class=ng-binding]' start"
        )

        try:
            WebDriverWait(
                driver,
                timeout).until(lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls

        self.log.debug("Wait Finished.")

        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")

        self.log.debug("continue pushing more view button")
        elements = self.continuous_pushing_more_view_button(driver, timeout)

        self.log.debug("create arrays of paper and url")

        for el in elements:
            cited_url = self.conf.getconf(
                "IEEE_website"
            ) + el.find_element_by_css_selector(
                'div[class="ref-links-container stats-citations-links-container"] > span > a'
            ).get_attribute("ng-href")
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = self.parse_citing(
                el.find_element_by_css_selector(
                    'div[ng-bind-html="::item.displayText"]').text)
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(
                title=cited_title,
                authors=cited_authors,
                conference=cited_conference,
                published=cited_date,
                url=cited_url,
                timestamp=timestamp)
            self.log.debug(cited_paper.get_vars())
            cited_paper.renewal_insert()
            cited_papers.append(cited_paper)  # returned to the caller's citation loop

        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.png")

        self.move_to_paper_initial_page(driver, initial_url)
        #self.save_current_page(driver, "./samples/sample_page_initial.html")
        #self.save_current_page(driver, "./samples/sample_page_initial.png")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def continuous_pushing_more_view_button(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##if not cited, load-more-button does not exist.
        ##but if cited, load-more-button always exists nevertheless no more paper,
        ##and the buttons are hidden.

        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
        )
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                       "], limit_of_view[" + str(limit_of_view) + "]")

        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf(
                "IEEE_citation_num_per_more_view")
            try:
                load_more_button = driver.find_element_by_xpath(
                    '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')"]'
                )
                load_more_button.click()

                WebDriverWait(
                    driver, timeout
                ).until(lambda driver: driver.find_element_by_xpath(
                    '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')" and @aria-disabled="false"]'
                ))
            except TimeoutException:
                m = "caught TimeoutException at loading more cited pages(" + str(
                    limit_of_view) + ") paper[" + driver.current_url + "]."
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = "caught NoSuchElementException at loading more cited pages(" + str(
                    limit_of_view) + ") paper[" + driver.current_url + "]."
                print(m)
                self.log.warning(m)
            except ElementNotVisibleException:
                m = "caught ElementNotVisibleException at loading more cited pages(" + str(
                    limit_of_view
                ) + ") paper[" + driver.current_url + "]. break."
                self.log.debug(m)
                break

            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitation"]')
            num_of_viewing = len(elements)
            self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                           "], limit_of_view[" + str(limit_of_view) + "]")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return elements

    def get_conference(self, driver):
        try:
            return driver.find_element_by_xpath('//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]')\
                .find_element_by_tag_name('a').text
        except NoSuchElementException:
            return ""

    def get_date_of_publication(self, driver):
        #Date of Publication: 06 January 2016 or Date of Conference: 14-16 Nov. 2006
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isJournal == true"]').text
            return self.convert_date_of_publication_to_datetime(date)
        except NoSuchElementException:
            try:
                date = driver.find_element_by_xpath(
                    '//div[@ng-if="::vm.details.isConference == true"]').text
                return self.convert_date_of_publication_to_datetime(date)
            except NoSuchElementException:
                self.log.debug("caught NoSuchElementException. date = None"
                               )  ##todo get from paper??
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.png"
                )
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.html"
                )
                return None

    def move_to_paper_initial_page(self, driver, initial_url, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.get(initial_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//div[@ng-repeat=\"article in vm.contextData.similar\"]'))"
        )
        try:
            WebDriverWait(
                driver,
                timeout).until(lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the paper top page.")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the paper top page.")

        self.log.debug("Wait Finished.")

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def wait_button_to_pdf_page(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("Wait start.")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at waiting button which go to pdf page."
            )
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at waiting button which go to pdf page."
            )
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_a_paper(self,
                         driver,
                         path="../../data/tmp/",
                         filename="default",
                         timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url

        m = "downloading paper to " + path + ". title[" + filename + "]"
        self.log.info(m)
        print(m)

        self.wait_button_to_pdf_page(driver, timeout)

        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')

        retries = 10
        while retries > 0:
            try:
                button.click()
                self.log.debug("clicked button and no exception. break")
                break
            except (RemoteDisconnected, ConnectionRefusedError, URLError) as e:
                self.log.warning("caught " + e.__class__.__name__ +
                                 " at click download pdf button. retries[" +
                                 str(retries) + "]")
                self.log.warning(e, exc_info=True)
                time.sleep(
                    self.conf.getconf("IEEE_wait_time_per_download_paper"))
                driver.reconnect(initial_url)
                self.wait_button_to_pdf_page(driver, timeout)
                button = driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]')
                retries -= 1
            except NoSuchElementException:
                self.log.warning(
                    "caught NoSuchElementException at click download pdf button. retries["
                    + str(retries) + "]")
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.html"
                )
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.png"
                )
                retries -= 1
        if retries == 0:
            self.log.error("button.click() error")
            self.save_current_page(driver, "./samples/button_click_error.html")
            self.save_current_page(driver, "./samples/button_click_error.png")

        self.log.debug("Wait start.")
        try:
            WebDriverWait(
                driver,
                timeout).until(lambda driver: driver.find_element_by_xpath(
                    '//frameset[@rows="65,35%"]/frame'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        self.log.debug("Wait Finished.")
        url = driver.find_elements_by_xpath(
            '//frameset[@rows="65,35%"]/frame')[1].get_attribute("src")
        self.log.debug("url:" + url)

        if filename == "default":
            filename = url[:url.index("?")].split("/")[-1]
        filename = filename.replace(":", "")
        self.log.debug("filename:" + filename)
        command = "wget -p \"" + url + "\" -O \"" + path + filename + "\" > /dev/null 2>&1"
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\" 1> /dev/null 2>&1"
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\""
        self.log.debug(command)
        try:
            self.log.debug(os.system(command))
        except Exception:
            self.log.warning("error at " + command)

        #self.save_current_page(driver, "./samples/7898372.png")
        #self.save_current_page(driver, "./samples/7898372.html")

        driver.get(initial_url)

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return[" + path + filename + "]")
        return path + filename

    def convert_date_of_publication_to_datetime(self, string):
        ##from
        ##Date of Publication: 06 January 2016
        ##to
        ##2016-01-06
        ##from
        ##Date of Conference: 14-16 Nov. 2006
        ##to
        ##2006-11-14
        ##from
        ##Date of Conference: 27 June-2 July 2016
        ##to
        ##2016-06-27
        ##from
        ##Date of Publication: N/A 2016
        ##to
        ##2016-01-01
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("string: " + string)
        date = ""
        month = ""
        year = ""
        string = string.replace("\n", "")
        tmp = string.split(":")
        if len(tmp) != 2:
            self.log.warning("len(tmp) != 2")
            self.log.warning("string:" + string)
            return None
        date_month_year = tmp[1].lstrip()
        self.log.debug("date_month_year[" + date_month_year + "]")
        tmp2 = date_month_year.split("-")
        if len(tmp2) >= 3:
            self.log.warning("len(tmp2) >= 3")
            self.log.warning("string:" + string)
            return None
        elif len(tmp2) == 2:
            if re.match(r"^\d{1,2}$", tmp2[0]):
                date = tmp2[0]
            elif re.match(r"^\d{1,2}\s[a-zA-Z]", tmp2[0]):
                tmp3 = tmp2[0].split(" ")
                date = tmp3[0]
                month = tmp3[1].replace(".", "")
        tmp4 = date_month_year.split(" ")
        if len(tmp4) < 3:
            self.log.debug("only year")
            self.log.debug("string:" + string)
            date = "1"
            month = "Jan"
        if date == "":
            date = tmp4[-3]
        if month == "":
            month = tmp4[-2].replace(".", "")
        if year == "":
            year = tmp4[-1]

        import datetime
        try:
            month = str(datetime.datetime.strptime(month, '%B').month)
        except ValueError:
            try:
                month = str(datetime.datetime.strptime(month, '%b').month)
            except ValueError:
                if month == "Sept":
                    month = "9"
                else:
                    self.log.warning("ValueError")
                    self.log.warning("string:" + string)
                    self.log.warning("month = 0")
                    month = "0"

        self.log.debug("year[" + year + "], month[" + month + "], date[" +
                       date + "]")

        timestamp = datetime.date(int(year), int(month), int(date))
        return timestamp
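
The conversions documented in the comment block can be spot-checked directly; here x stands for an already-constructed IEEEXplore instance (an assumption of this sketch):

    import datetime

    assert x.convert_date_of_publication_to_datetime(
        "Date of Publication: 06 January 2016") == datetime.date(2016, 1, 6)
    assert x.convert_date_of_publication_to_datetime(
        "Date of Conference: 14-16 Nov. 2006") == datetime.date(2006, 11, 14)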

    def convert_paper_url_to_cited_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        #from
        #http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        self.log.debug("url[" + url + "]")
        converted_url = url.split("?")[
            0] + "citations?anchor=anchor-paper-citations-ieee&ctx=citations"
        self.log.debug("converted_url[" + converted_url + "]")

        return converted_url
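
Because the conversion is a plain string operation, the example from the comments doubles as a test (x again an assumed instance):

    src = "http://ieeexplore.ieee.org/document/4116687/?reload=true"
    assert x.convert_paper_url_to_cited_url(src) == (
        "http://ieeexplore.ieee.org/document/4116687/"
        "citations?anchor=anchor-paper-citations-ieee&ctx=citations")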

    def convert_paper_url_to_pdf_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##from
        ##http://ieeexplore.ieee.org/document/6324382/
        ##to
        ##http://ieeexplore.ieee.org/ielx7/35/7901458/07901477.pdf?tp=&arnumber=7901477&isnumber=7901458
        print("url[" + url + "]")

    def parse_citing(self, strings):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + strings + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale Web Application Community", Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on
        #pp. 185-190, 2013
        array = strings.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""

        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        self.log.debug("new_array:" + str(new_array))
        self.log.debug(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""

        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        #self.log.debug("re.match(\"\d*\", " + year + ")")
        #year = re.match("*\d*",year).group() + "-01-01 00:00:00"
        #year += "-01-01 00:00:00"
        self.log.debug("citing year is none")
        year = None
        self.log.debug("authors[" + str(authors) + "], title[" + str(title) +
                       "], conference[" + str(conference) + "], year[" +
                       str(year) + "]")
        return authors, title, conference, year

    def reconnect_driver(self, driver, url):
        self.log.debug("driver reconnect")
        import signal
        driver.service.process.send_signal(
            signal.SIGTERM)  # kill the specific phantomjs child proc
        driver.quit()  # quit the node proc
        driver = self.create_driver(url)
        return driver

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.warning(__class__.__name__ + "." +
                         sys._getframe().f_code.co_name + " start")
        self.log.warning("this method will be removed.")
        self.log.warning("please use driver.save_current_page(filename)")

        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            f = open(filename, 'w')
            f.write(driver.page_source)
            f.close()
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
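
An end-to-end sketch of the class; the keyword and paper count are illustrative:

    ieee = IEEEXplore()
    papers, urls, citing_urls, cited_urls = ieee.get_papers_by_keywords(
        "botnet", num_of_papers=5)
    for paper in papers:
        print(paper.title)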
Example #13
class Table_papers(Base):
    __tablename__ = 'papers'

    id = Column("id", INTEGER, primary_key=True)
    title = Column("title", TEXT)
    authors = Column("authors", TEXT)
    keywords = Column("keywords", TEXT)
    citings = Column("citings", MEDIUMTEXT)
    citeds = Column("citeds", MEDIUMTEXT)
    conference = Column("conference", TINYTEXT)
    published = Column("published", DATE)
    url = Column("url", TINYTEXT)
    abstract_path = Column("abstract_path", TEXT)
    pdf_path = Column("pdf_path", TEXT)
    timestamp = Column("timestamp", DATETIME)
    label = Column("label", TINYTEXT)
    color = Column("color", TINYTEXT)

    def __init__(self,
                 id="",
                 title="",
                 authors="",
                 keywords="",
                 citings="",
                 citeds="",
                 conference="",
                 published="",
                 url="",
                 timestamp="",
                 abstract_path="",
                 pdf_path="",
                 label="",
                 color=""):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        import mysql_operator
        self.db = mysql_operator.Mysql_operator()

        self.id = id
        self.title = title
        self.authors = authors
        self.keywords = keywords
        self.citings = citings
        self.citeds = citeds
        self.conference = conference
        if published == "":
            self.published = None
        else:
            self.published = published
        self.url = url
        if timestamp == "":
            self.timestamp = None
        else:
            self.timestamp = timestamp
        self.abstract_path = abstract_path
        self.pdf_path = pdf_path
        self.label = label
        self.color = color

    def __repr__(self):
        return 'Table_papers'

    def insert(self):
        if self.id == "":
            self.id = self.get_id()
        vars_to_encode = [
            "title", "authors", "keywords", "abstract_path", "pdf_path"
        ]
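        ## encode the text fields to UTF-8 bytes for the DB insert, then
        ## decode them back so the in-memory object keeps str values.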
        for var in vars_to_encode:
            if getattr(self, var) is not None:
                setattr(self, var,
                        getattr(self, var).encode('utf-8', 'replace'))
        self.db.insert(self)
        for var in vars_to_encode:
            if getattr(self, var) is not None:
                setattr(self, var,
                        getattr(self, var).decode('utf-8', 'replace'))

        self.db.session.expunge(self)
        self.close()

    def has_already_downloaded(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("paper.title[" + self.title + "]")
        if self.conf.getconf("IEEE_paper_download_period") <= 0:
            self.log.debug("IEEE_paper_download_period <= 0, return False")
            return False
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8', 'replace')).all()
        if len(records) == 0:
            self.log.debug("This paper doesnt exist in db. return false")
            return False
        elif len(records) >= 2:
            self.log.warning("need to merge records")
            self.log.warning("title[" + self.title + "], len(records)[" +
                             str(len(records)) + "]")

        self.log.debug("This paper exist in db. Number of records is [" +
                       str(len(records)) + "]")
        if records[0].abstract_path == "":
            self.log.debug("but the abstract not downloaded. return False")
            return False
        self.log.debug(
            "and the abstract already downloaded. compare timestamps")

        limit = datetime.datetime.now() - timedelta(
            days=self.conf.getconf("IEEE_paper_download_period"))
        self.log.debug("limit[" + str(limit) + "], records[" +
                       str(records[0].timestamp) + "]")
        if limit > records[0].timestamp:
            self.log.debug("should renew db. return false")
            return False
        else:
            self.log.debug("recently downloaded. clone paper and return true")
            clone_vars = [
                "authors", "keywords", "citings", "citeds", "conference",
                "published", "url", "timestamp", "abstract_path", "pdf_path",
                "label", "color"
            ]
            for var in clone_vars:
                setattr(self, var, getattr(records[0], var))
            self.close()
            return True

    def renewal_insert(self):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")

        #check duplication and insert
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8')).all()
        if len(records) == 0:  #new record
            self.insert()
            return 0

        merge_id_list = []
        for record in records:
            merge_id_list.append(record.id)

        vars = [
            "authors", "keywords", "citings", "citeds", "conference",
            "published", "url", "abstract_path", "pdf_path", "label", "color"
        ]
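        ## merge policy: for each field, keep the newest non-empty value
        ## across self and all duplicate records, tracking the timestamp
        ## of whichever record supplied the current value.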
        for var in vars:
            for record in records:
                self.log.debug("record.id[" + str(record.id) + "]")
                self.log.debug("var[" + var + "], self[" +
                               str(getattr(self, var)) + "], record[" +
                               str(getattr(record, var)) + "]")
                tmp_timestamp = self.timestamp

                if getattr(record, var) in (None, ""):
                    self.log.debug("record." + var + " is empty")
                elif getattr(self, var) in (None, ""):
                    self.log.debug("self." + var + " is empty")
                    setattr(self, var, getattr(record, var))
                    self.log.debug("->var[" + var + "], self[" +
                                   str(getattr(self, var)) + "], record[" +
                                   str(getattr(record, var)) + "]")
                    tmp_timestamp = record.timestamp
                else:
                    self.log.debug(var + " is not none. compare timestamps")
                    ## todo: check type(timestamp)
                    if tmp_timestamp is None or record.timestamp is None or self.compare_timestamps(
                            old=tmp_timestamp, new=record.timestamp):
                        ##if record.timestamp is newer
                        setattr(self, var, getattr(record, var))
                        self.log.debug("->var[" + var + "], self[" +
                                       str(getattr(self, var)) + "], record[" +
                                       str(getattr(record, var)) + "]")
                        tmp_timestamp = record.timestamp

        for record in records:
            self.db.delete(record)
        self.id = self.get_id()
        import time
        self.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        self.insert()

        ##merge citations
        self.log.debug("merge[" + str(merge_id_list) + "] to self.id[" +
                       str(self.id) + "]")
        for merge_id in merge_id_list:
            from table_citations import Table_citations
            from sqlalchemy import and_, or_
            merge_records = self.db.session.query(Table_citations).filter(
                or_(Table_citations.start == merge_id,
                    Table_citations.end == merge_id)).all()
            self.log.debug("id[" + str(merge_id) + "].records[" +
                           str(len(merge_records)) + "]")
            for merge_record in merge_records:
                self.merge_citations(merge_record,
                                     merge_id_list,
                                     survival_id=self.id,
                                     delete_id=merge_id)

        self.close()

    def merge_citations(self, merge_record, merge_id_list, survival_id,
                        delete_id):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("from[" + str(merge_record.start) + "]to[" +
                       str(merge_record.end) + "]")
        self.log.debug("survival_id[" + str(survival_id) + "], delete_id[" +
                       str(delete_id) + "]")
        from table_citations import Table_citations

        ##is delete_id start or end?
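        ## three cases: both endpoints are merged ids (drop the edge);
        ## start is the deleted id (re-point start at survival_id);
        ## end is the deleted id (re-point end at survival_id).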
        if merge_record.start in merge_id_list and merge_record.end in merge_id_list:
            self.log.debug("start[" + str(merge_record.start) + "] and end[" +
                           str(merge_record.end) + "] are merge_id. delete.")
            self.log.debug("delete(merge_record)")
            self.db.delete(merge_record)

        #elif merge_record.start == delete_id and not merge_record.end in merge_id_list:
        elif merge_record.start == delete_id:
            self.log.debug("start[" + str(delete_id) + "] is delete_id. end[" +
                           str(merge_record.end) + "]")
            self.log.debug("delete(merge_record)")
            self.db.delete(merge_record)
            citation = Table_citations(start=survival_id, end=merge_record.end)
            citation.renewal_insert()
            citation.close()
        #elif merge_record.end == delete_id and not merge_record.end in merge_id_list:
        elif merge_record.end == delete_id:
            self.log.debug("end[" + str(merge_record.end) +
                           "] is delete_id. start[" + str(merge_record.start) +
                           "]")
            citation = Table_citations(start=merge_record.start,
                                       end=survival_id)
            self.log.debug("delete(merge_record)")
            self.db.delete(merge_record)
            citation.renewal_insert()
            citation.close()

    def compare_timestamps(self, old, new):
        self.log.debug("compare old_timestamp[" + str(old) + "] < new[" +
                       str(new) + "]?")
        old_str = str(old)
        new_str = str(new)
        if old_str < new_str:
            self.log.debug("return true")
            return True
        else:
            self.log.debug("return false")
            return False

    def get_citings_array(self):
        return self.citings.split(",")

    def get_citeds_array(self):
        return self.citeds.split(",")

    def get_id(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        ##when the records which have same title exist,
        ##the id is smallest one of records.
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8')).all()
        if len(records) == 0:  #new record
            return self._get_available_id()

        id = records[0].id
        for record in records:
            if id > record.id:
                id = record.id
        return id

    def _get_available_id(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        previous_id = 0
        for q in self.db.session.query(__class__).order_by(__class__.id):
            if q.id - previous_id >= 2:
                self.log.debug("id[" + str(q.id) + "] - previous_id[" +
                               str(previous_id) + "] > 2. return " +
                               str(previous_id + 1))
                return previous_id + 1
            previous_id = q.id
        self.log.debug("for loop ended. return " + str(previous_id + 1))
        return previous_id + 1

    def close(self):
        self.db.session.close()
        self.db.close()

    def get_vars(self):
        return ("{" + "id: " + str(self.id) + ", " + "title: " + self.title +
                ", " + "authors: " + self.authors + ", " + "keywords: " +
                self.keywords + ", " + "citings: " + self.citings + ", " +
                "citeds: " + self.citeds + ", " + "conference: " +
                self.conference + ", " + "published: " + str(self.published) +
                ", " + "url: " + self.url + ", " + "timestamp: " +
                str(self.timestamp) + ", " + "abstract_path: " +
                self.abstract_path + ", " + "pdf_path: " + self.pdf_path +
                ", " + "label: " + self.label + ", " + "color: " + self.color +
                ", " + "}")
Exemple #14
0
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()

        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self, keywords, num_of_papers):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")

        driver = self.create_driver()
        self.search_by_keywords(driver, keywords)
        urls = self.get_urls_of_papers_in_keywords_page(driver, num_of_papers)
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []
        """
		for url in urls:
			driver.get(url)
			paper, citing_urls, cited_urls = self.get_attributes_and_download_pdf(driver)
			all_papers.append(paper)
			all_citing_urls.append(citing_urls)
			all_cited_urls.append(cited_urls)
			self.log.info(__class__.__name__ + "." + sys._getframe().f_code.co_name + " finished")
		"""
        return all_papers, all_cited_urls, all_citing_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, top_page=""):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        phantomjs_path = self.conf.getconf("phantomJS_pass")
        if top_page == "":
            top_page = self.conf.getconf("IEEE_top_page")
        from selenium import webdriver
        driver = webdriver.PhantomJS(phantomjs_path)
        self.log.debug("driver.get(" + top_page + ")")
        driver.get(top_page)

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def search_by_keywords(self, driver, keywords):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def set_options(self):
        pass

    def get_urls_of_papers_in_keywords_page(self, driver, num_of_papers):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        urls = []
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")

        for link in links:
            element = link.find_element_by_css_selector("h2 > a")
            urls.append(element.get_attribute("href"))

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished. return " +
                       str(urls))
        return urls

    def get_attributes_and_download_pdf(self, search, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        timeout = 30
        target_paper_url = search.node

        self.log.info("url[" + target_paper_url + "], times[" +
                      str(search.times) + "], limit[" + str(search.limit) +
                      "]")
        #if this paper has already been downloaded, mark it as visited and skip it.

        driver.get(target_paper_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//div[@ng-repeat=\"article in vm.contextData.similar\"]'))"
        )

        try:
            WebDriverWait(
                driver,
                timeout).until(lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)

        self.log.debug("Wait Finished.")

        import table_papers
        paper = table_papers.Table_papers()

        self.log.debug("get attributes of this paper")
        #paper.title = self.get_title(driver)
        #paper.authors = self.get_authors(driver)
        #paper.keywords = self.get_keywords(driver)
        #citing_urls = []
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        cited_urls = []
        #paper.citeds, cited_papers, cited_urls = self.get_cited_papers(driver, timeout)
        #paper.conference = self.get_conference(driver)
        #paper.published = self.get_date_of_publication(driver)
        #paper.url = target_paper_url
        import time
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        ##path

        #paper.renewal_insert()
        print(paper.get_vars())

        self.log.debug("insert citations of this paper to db")
        import table_citations

        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        #element = driver.find_element_by_tag_name("title")
        #element = driver.find_element_by_id("title")
        #element = driver.find_element_by_css_selector("html > title")
        #element = driver.find_element_by_class_name("title")
        return driver.title

    def get_authors(self, driver):
        ##authors
        #<span ng-bind-html="::author.name" class="ng-binding">
        #elements = driver.find_elements_by_class_name("authors-container")
        #print(str(len(elements)))
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')
        #print(str(len(elements))) #5
        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        ##keywords
        keywords_str = ""
        elements = driver.find_elements_by_xpath('//a[@ng-bind-html="::term"]')
        #print(str(len(elements))) #21
        for el in elements:
            keyword = el.text
            if keyword in keywords_str:
                ##todo: substring match is too loose ("internet" includes "int")
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not add.")
            else:
                keywords_str += "," + el.text
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        """
					<a ng-href="/document/4116687" title="Using Machine Learning Techniques to Identify Botnet Traffic" target="_self" href="/document/4116687">
				<span ng-bind-html="::(vm.contextData.isStandard ? article.standardNumber + ' - ' + article.title : article.title) | charlimitHtml:185" mathjax-bind="" class="ng-binding">Using Machine Learning Techniques to Identify Botnet Traffic</span>
			</a>
			<div class="ng-binding">Carl Livadas; Robert Walsh; David Lapsley; W. Timothy Strayer</div>
		</div><!-- end ngRepeat: article in vm.contextData.similar --><div class="doc-all-related-articles-list-item ng-scope" ng-repeat="article in vm.contextData.similar"> 
		"""
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []

        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_elements_by_css_selector('div[ng-repeat=\"article in vm.contextData.similar\"] > a')) start"
        )

        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_elements_by_css_selector(
                    'div[ng-repeat="article in vm.contextData.similar"] > a'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls

        self.log.debug("Wait Finished.")

        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="article in vm.contextData.similar"]')

        print(str(len(elements)))
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.png")
        print("create arrays of paper and url")

        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = self.conf.getconf(
                "IEEE_website") + el.find_element_by_css_selector(
                    'a').get_attribute("ng-href")
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            import time
            citing_paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            print("citing_url[" + citing_paper.url + "]")
            print("citing_title[" + citing_paper.title + "]")
            print("citing_authors[" + citing_paper.authors + "]")
            print(citing_paper.get_vars())

            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)

        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")

        import table_papers

        citeds_str = ""
        cited_papers = []
        cited_urls = []

        #href="/document/4116687/citations?tabFilter=papers"
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.png")
        """
		<div ng-if="!vm.loading &amp;&amp; !vm.details.paperCitations.ieee &amp;&amp; !vm.details.paperCitations.nonIeee &amp;&amp; !vm.details.patentCitations" class="ng-scope" style="">
		Citations are not available for this document.
	</div>
		"""
        #el = driver.find_element_by_xpath('//div[@ng-if="!vm.loading &amp;&amp; !vm.details.paperCitations.ieee &amp;&amp; !vm.details.paperCitations.nonIeee &amp;&amp; !vm.details.patentCitations"')
        #els = driver.find_elements_by_xpath('//div[@class="ng-scope" and @style=""]') #ok. got els
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee &amp;&amp; !vm.contextData.paperCitations.nonIeee &amp;&amp; !vm.contextData.patentCitations"]') #0
        #><div ng-if="::!vm.contextData.paperCitations.ieee &amp;&amp; !vm.contextData.paperCitations.nonIeee &amp;&amp; !vm.contextData.patentCitations" class="ng-scope">
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee"]') #0
        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > div[class="ng-scope"] > div[class="strong"]'
            ).text
            if div == "Citations not available for this document.":
                self.log.debug("this paper not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")
        """
		try:
			driver.find_element_by_name('queryText').send_keys(keywords)
			driver.find_element_by_class_name('Search-submit').click()
		except(Exception) as e:
			self.log.exception('[[EXCEPTON OCCURED]]: %s', e)
			sys.exit("[[EXCEPTON OCCURED]]please check logfile.")
			
		document-banner-metric ng-scope
		ui-sref="document.full({tab:'citations', q:null, ctx:null, section:null, part:null, anchor:null, tabFilter: 'papers'})"
		#self.save_current_page(driver, "./samples/sample_page2.html")
		self.save_current_page(driver, "./samples/sample_page2.png")
	<button class="load-more-button" type="button" ng-click="vm.loadMoreCitations('patent')" ng-disabled="vm.loading" tabindex="0" aria-disabled="false">
				<span ng-show="!vm.loading" aria-hidden="false" class="">View More</span>
				<i class="fa fa-spinner fa-spin ng-hide" ng-show="vm.loading" aria-hidden="true"></i>
		"""
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//b[@class=\"ng-binding\"]')) start"
        )

        try:
            WebDriverWait(
                driver,
                timeout).until(lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            m = "caught TimeoutException at load the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls

        self.log.debug("Wait Finished.")

        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")

        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
        )
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        print("num_of_viewing[" + str(num_of_viewing) + "], limit_of_view[" +
              str(limit_of_view) + "]")

        print("continue pushing more view button")
        ##if not cited, load-more-button does not exist.
        ##but if cited, load-more-button always exists nevertheless no more paper,
        ##and the buttons are hidden.
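        ##loop while the page still shows nearly `limit_of_view` items,
        ##i.e. the previous click may not have exhausted the list.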
        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf(
                "IEEE_citation_num_per_more_view")
            load_more_button = driver.find_element_by_xpath(
                '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')"]'
            )
            load_more_button.click()
            try:
                WebDriverWait(
                    driver, timeout
                ).until(lambda driver: driver.find_element_by_xpath(
                    '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')" and @aria-disabled="false"]'
                ))
            except TimeoutException:
                m = "caught TimeoutException at loading more cited pages(" + str(
                    limit_of_view) + ") paper[" + driver.current_url + "]."
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = "caught NoSuchElementException at loading more cited pages(" + str(
                    limit_of_view) + ") paper[" + driver.current_url + "]."
                print(m)
                self.log.warning(m)
            #elements = driver.find_elements_by_css_selector('div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div > div > b[class="ng-binding"]')
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
            )
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_"+str(limit_of_view)+".html")
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_"+str(limit_of_view)+".png")
            num_of_viewing = len(elements)
            print("num_of_viewing[" + str(num_of_viewing) +
                  "], limit_of_view[" + str(limit_of_view) + "]")

        print("cited loop finished. num_of_viewing[" + str(num_of_viewing) +
              "], limit_of_view[" + str(limit_of_view) + "]")

        print("create arrays of paper and url")

        for el in elements:
            cited_url = self.conf.getconf(
                "IEEE_website"
            ) + el.find_element_by_css_selector(
                'div[class="ref-links-container stats-citations-links-container"] > span > a'
            ).get_attribute("ng-href")
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = self.parse_citing(
                el.find_element_by_css_selector(
                    'div[ng-bind-html="::item.displayText"]').text)
            import time
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(
                title=cited_title,
                authors=cited_authors,
                conference=cited_conference,
                published=cited_date,
                url=cited_url,
                timestamp=timestamp)
            print(cited_paper.get_vars())
            cited_paper.renewal_insert()

        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.png")

        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def get_conference(self, driver):
        conference = driver.find_element_by_xpath('//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]')\
             .find_element_by_tag_name('a').text

        return conference

    def get_date_of_publication(self, driver):
        #Date of Publication: 06 January 2006 or Date of Conference 14-16 Nov. 2006
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isConference == true"]').text
        except NoSuchElementException:
            self.log.debug(
                "catch NoSuchElementException. date = ''")  ##todo paper
            date = ""
        return self.convert_to_datetime(date)

    def download_a_paper(self, driver, path="../../data/tmp/"):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url
        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')
        button.click()
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.html")
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.png")
        driver.get(initial_url)

        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_papers_by_keywords(self, driver, path, download_num=25):
        # 0: desktop, 1: system default download folder, 2: user-defined folder
        #driver.setPreference("browser.download.folderList",2)
        # since 2 is selected above, specify the download directory
        #driver.setPreference("browser.download.dir", path)
        # boolean: whether to show the download manager window when the download completes
        #driver.setPreference("browser.download.manager.showWhenStarting",False)
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")
        i = 0
        for link in links:
            self.log.debug("txt:" + link.text)
            element = link.find_element_by_css_selector("h2 > a")
            pdf_title = element.text
            self.log.debug("pdf_title:" + pdf_title)
            pdf_url = self.convert_path_to_url(element.get_attribute("href"))
            self.log.debug("pdf_dir:" + pdf_url)

            element = link.find_element_by_css_selector("p")
            pdf_authors = link.find_element_by_css_selector("p").text.split(
                "; ")
            self.log.debug("pdf_author:" + str(pdf_authors))

            print("pdf_title:" + pdf_title)
            print("pdf_dir:" + pdf_url)
            print("pdf_author:" + str(pdf_authors))

            i += 1
            if i >= download_num:
                self.log.debug("i>=" + str(download_num) + "." +
                               __class__.__name__ + "." +
                               sys._getframe().f_code.co_name + " finished.")
                return 0
        self.log.debug("len(link)<" + str(download_num) + "." +
                       __class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished.")
        return 0

    """	
	def get_papers_with_breadth_first_search(self, root_url_of_paper):
		
		import math
		math.breadth_first_search(root_url_of_paper, get_citing_papers() )

		self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start")
		self.log.debug("root_url_of_paper["+root_url_of_paper+"]")
		
		citing_urls, cited_urls = ***
		
		for url in citing_urls:
			self.get_papers_with_breadth_first_search(url)

		self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " finished")
	"""

    def convert_to_datetime(self, date_str):
        self.log.warning("!!!incomplete method[" + __class__.__name__ + "." +
                         sys._getframe().f_code.co_name + "]!!!")
        import time
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        return timestamp
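
    ## a hedged sketch (not in the original) of a fuller implementation:
    ## parse "06 January 2006" / "14-16 Nov. 2006" style strings instead
    ## of returning the current time.
    def _convert_to_datetime_sketch(self, date_str):
        import re
        import datetime
        # grab the trailing "<day> <month> <year>" group, e.g. "16 Nov. 2006"
        m = re.search(r'(\d{1,2}) ([A-Za-z]+)\.? (\d{4})\s*$', date_str)
        if m is None:
            return None
        day, month, year = m.groups()
        for fmt in ('%d %B %Y', '%d %b %Y'):  # full or abbreviated month
            try:
                return datetime.datetime.strptime(
                    day + " " + month + " " + year, fmt)
            except ValueError:
                continue
        return None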

    def convert_paper_url_to_cited_url(self, url):
        #from
        #http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
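        #note: relies on the document URL keeping its trailing slash, as in
        #the example above, so split("?")[0] ends with "/".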
        return url.split("?")[
            0] + "citations?anchor=anchor-paper-citations-ieee&ctx=citations"

    def parse_citing(self, src):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + src + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale Web Application Community", Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on
        #pp. 185-190, 2013
        array = str.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("str[" + str + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""

        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        print(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("str[" + str + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""

        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("str[" + str + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""

        return authors, title, conference, year

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            f = open(filename, 'w')
            f.write(driver.page_source)
            f.close()
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
Exemple #15
0
	def test_var(self):
		from conf import Conf
		print("IEEE_website["+Conf.getconf("IEEE_website")+"]")
		print("IEEE_top_page["+Conf.getconf("IEEE_top_page")+"]")
		paper_url = Conf.getconf("IEEE_website") + "/document/6550394"
		print("paper_url[" + paper_url + "]")
Exemple #16
0
	def test_conf(self):
		from conf import Conf
		print("loglevel["+Conf.getconf("loglevel")+"]")
import datetime
import os
import re
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
from conf import Conf
from log import Log

sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../src/scraping")
from kakaku import Kakaku

log = Log.getLogger()
kakaku = Kakaku()

args = sys.argv[1:]
print("products: " + str(args))
log_dir = Conf.getconf("product_log_dir")
for arg in args:
    log.info("target product name[" + str(arg) + "]")
    product_name = arg
    log_name = str(datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" +
                   re.sub(" |/", "_", product_name) + ".log")
    #log_name = "product.log"
    #product_log = Log.getLogger(logfile=log_dir+log_name)
    product_log = log
    print("product: " + product_name + ", save log to: " + log_dir + log_name)
    try:
        kakaku.save_cheapest_pdf(product_name, logger=product_log)
    except Exception as e:
        print("Faild. caught " + e.__class__.__name__ +
              " exception. Please retry [" + product_name + "]")
        print(e)
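
A hedged invocation example for the driver script above (the script filename is hypothetical; it is not given here):

    # each positional argument is treated as one product name to scrape
    $ python save_cheapest_pdfs.py "example product A" "example product B"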