Example #1
0
    def parse(self, response):
        """Collect author and affiliation rows from a page and save them.

        Every node whose class contains ``authorName`` becomes a
        ``GoogleAuthorsItem`` and every node whose class contains
        ``affiliation`` becomes a ``GoogleAffiliationItem``.  Both kinds of
        row carry the ``publication_id`` and ``article_id`` found in
        ``response.meta``.  Nothing is saved until both lists are built.
        """
        authors = []
        for node in response.xpath('//*[contains(@class, "authorName")]'):
            row = GoogleAuthorsItem()
            row['publication_id'] = response.meta['publication_id']
            row['article_id'] = response.meta['article_id']
            # The affiliation id is always stored as an empty string here.
            row['affiliation_id'] = ''
            name_markup = node.xpath('.').extract()
            row['fullname'] = DataFilter.simple_format(name_markup)
            row['create_time'] = mysql_datetime()
            authors.append(row)

        affiliations = []
        for node in response.xpath('//*[contains(@class, "affiliation")]'):
            row = GoogleAffiliationItem()
            row['publication_id'] = response.meta['publication_id']
            row['article_id'] = response.meta['article_id']
            desc_markup = node.xpath('.').extract()
            row['desc'] = DataFilter.simple_format(desc_markup)
            row['create_time'] = mysql_datetime()
            affiliations.append(row)

        MYSQLUtils.save(self, "google_authors", authors)
        MYSQLUtils.save(self, "google_affiliations", affiliations)
Example #2
0
 def __init__(self, mode=None, **kwargs):
     """Prepare the spider, optionally crawling the categories first.

     :param mode: stored as ``self.mode``.  When it equals ``"init"`` the
         ``GoogleScholarCategorySpider`` crawl is launched through
         ``os.system`` before the publication/article cleanup runs.
     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.db = mysql_connection
     self.mode = mode
     rebuild_categories = (mode == "init")
     if rebuild_categories:
         os.system("scrapy crawl GoogleScholarCategorySpider")
     MYSQLUtils.cleanup_google_publication_articles(self)
     super(GoogleScholarSpider, self).__init__(**kwargs)
Example #3
0
    def read(self):
        """Fill ``self.data`` with one entry per candidate section.

        Each section is loaded through the ``MYSQLUtils`` helper named
        alongside it, which is called with ``self.db``.

        :return: ``self``.
        """
        loaders = (
            ('courses', 'fetch_courses_data'),
            ('education', 'fetch_education_data'),
            ('publications', 'fetch_publications_data'),
            ('research', 'fetch_research_data'),
            ('workexperience', 'fetch_workexperience_data'),
        )
        # Sections are fetched one after another, in the order listed.
        for section, helper_name in loaders:
            fetch = getattr(MYSQLUtils, helper_name)
            self.data[section] = fetch(self.db)

        return self
Example #4
0
 def __init__(self, fmt="mysql", **kwargs):
     """Select the output backend and clean up its previous data.

     :param fmt: stored as ``self.fmt``.  ``"mysql"`` attaches
         ``mysql_connection`` as ``self.db`` and runs
         ``MYSQLUtils.cleanup_data``; any other value creates a
         ``FileHandler`` as ``self.fh`` and calls its ``cleanup_data``
         with this spider and ``fmt``.
     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.fmt = fmt
     if fmt != "mysql":
         self.fh = FileHandler()
         self.fh.cleanup_data(self, fmt)
     else:
         self.db = mysql_connection
         MYSQLUtils.cleanup_data(self)
     super(StanfordSpider, self).__init__(**kwargs)
 def parse_cate2(self, response):
     """Save the sub-categories linked from a first-level category page.

     Each anchor below the element with id ``gs_m_rbs`` becomes a
     ``GoogleCategoryItem`` whose ``fid`` is the ``cate1_id`` carried in
     ``response.meta``.  All items are saved to ``google_category`` in a
     single call after the loop.
     """
     parent_id = response.meta['cate1_id']
     children = []
     for link in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
         child = GoogleCategoryItem()
         child['fid'] = parent_id
         child['name'] = DataFilter.simple_format(link.xpath('.').extract())
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         # The extracted href is appended directly to the spider's domain.
         child['cate_url'] = "%s%s" % (self.domain, href)
         child['create_time'] = mysql_datetime()
         children.append(child)
     MYSQLUtils.save(self, "google_category", children)
Example #6
0
 def insert_article(self, response):
     """Save one ``google_articles`` row and record its id in the meta.

     The stored link is ``response.url`` unless the content of the page's
     ``<meta http-equiv="refresh">`` tag matches
     ``self.article_link_pattern``, in which case the first captured group
     is used.  Every column reported by ``MYSQLUtils.get_columns_by_item``
     is first copied from ``response.meta``; ``article_link`` is then set
     to the chosen link.  The first element of the save result is written
     back as ``response.meta['article_id']``.
     """
     refresh_content = DataFilter.simple_format(
         response.xpath('//meta[@http-equiv="refresh"]/@content').extract())
     redirect = re.search(self.article_link_pattern, refresh_content)
     link = redirect.group(1) if redirect else response.url

     record = GoogleArticlesItem()
     for column in MYSQLUtils.get_columns_by_item(record):
         record[column] = response.meta[column]
     record['article_link'] = link

     saved = MYSQLUtils.save(self, "google_articles", record)
     response.meta['article_id'] = saved[0]
Example #7
0
 def parse(self, response):
     """Save each listed publication and request its article list.

     Every row after the first in the ``gs_cit_list_table`` table becomes
     a ``GooglePublicationItem`` saved to ``google_publication``.  The
     first element of the save result and the row's ``h5_idx`` are written
     into ``response.meta``, which is then passed as the meta of a request
     for the link found in the third cell; that request is handled by
     ``self.parse_article_list``.
     """
     def text_of(node, path):
         # Extract ``path`` relative to ``node`` and normalise the result.
         return DataFilter.simple_format(node.xpath(path).extract())

     for tr in response.xpath(
             '//*[@id="gs_cit_list_table"]/tr[position()>1]'):
         publication = GooglePublicationItem()
         publication['cate1_id'] = response.meta['cate1_id']
         publication['cate2_id'] = response.meta['cate2_id']
         publication['name'] = text_of(tr, 'td[position()=2]')
         publication['desc'] = ''
         publication['h5_idx'] = text_of(tr, 'td[position()=3]')
         publication['h5_med'] = text_of(tr, 'td[position()=4]')
         publication['rank'] = text_of(tr, 'td[position()=1]')
         publication['create_time'] = mysql_datetime()

         list_href = text_of(tr, 'td[position()=3]/a/@href')
         article_list_url = "%s%s" % (self.domain, list_href)

         saved = MYSQLUtils.save(self, "google_publication", publication)
         response.meta['publication_id'] = saved[0]
         response.meta['h5_idx'] = publication['h5_idx']
         yield Request(article_list_url,
                       callback=self.parse_article_list,
                       meta=response.meta)
Example #8
0
 def start_requests(self):
     """Yield one request per category from ``MYSQLUtils.fetch_cate_list``.

     Each request targets the URL that ``get_google_spider_url`` returns
     for the category's ``cate_url`` and is handled by ``self.parse``.
     Its meta carries the category's ``fid`` as ``cate1_id`` and its
     ``cate_id`` as ``cate2_id``.
     """
     for category in MYSQLUtils.fetch_cate_list(self):
         ids = {"cate1_id": category['fid'], "cate2_id": category['cate_id']}
         target = get_google_spider_url(category['cate_url'])
         yield Request(target, callback=self.parse, meta=ids)
    def parse_staff_profile(self, response):
        """Parse a staff profile page and save each of its sections.

        The ``field-type-text-with-summary`` blocks of the page are handed,
        together with the ``cb_id`` carried in ``response.meta``, to one
        ``parse_candidate_*_item`` method per section; each result is saved
        to the table named alongside it before the next section is parsed.
        """
        owner_id = response.meta['cb_id']
        body = response.xpath(
            '//div[contains(@class, "field-type-text-with-summary")]')

        steps = (
            ("candidate_education", "parse_candidate_education_item"),
            ("candidate_research", "parse_candidate_research_item"),
            ("candidate_publications", "parse_candidate_publications_item"),
            ("candidate_courses", "parse_candidate_courses_item"),
            ("candidate_workexperience",
             "parse_candidate_workexperience_item"),
        )
        for table, parser_name in steps:
            parse_section = getattr(self, parser_name)
            section_items = parse_section(body, owner_id)
            MYSQLUtils.save(self, table, section_items)
Example #10
0
 def start_requests(self):
     """Yield one request per article from ``MYSQLUtils.fetch_article_list``.

     The request goes to the article's ``article_link``, is handled by
     ``self.parse``, and carries a copy of every key/value of the article
     row as its meta.  Each request is sent with a fixed desktop Chrome
     ``User-Agent`` header.
     """
     for article in MYSQLUtils.fetch_article_list(self, self.domain):
         target = article['article_link']
         row_copy = {key: article[key] for key in article.keys()}
         browser_headers = {
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
         }
         yield Request(target,
                       callback=self.parse,
                       meta=row_copy,
                       headers=browser_headers)
 def parse(self, response):
     """Save each top-level category and request its sub-category page.

     Every anchor below the element with id ``gs_m_broad`` is saved to
     ``google_category`` with ``fid`` 0.  The first element of the save
     result is passed on as ``cate1_id`` in the meta of a request for the
     category URL, which is handled by ``self.parse_cate2``.
     """
     for link in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         url = "%s%s" % (self.domain, href)
         label = DataFilter.simple_format(link.xpath('.').extract())

         category = GoogleCategoryItem()
         category['fid'] = 0
         category['name'] = label
         category['cate_url'] = url
         category['create_time'] = mysql_datetime()

         saved = MYSQLUtils.save(self, "google_category", category)
         parent_id = saved[0]
         yield Request(url,
                       callback=self.parse_cate2,
                       meta={"cate1_id": parent_id})
 def parse(self, response):
     """Save each staff contact and request its profile page if it has one.

     Every ``staff-contact`` block is parsed with
     ``self.parse_candidate_basic_item`` and saved to ``candidate_basic``.
     When ``self.parse_staff_profile_url`` returns a truthy URL for the
     block, a request for it is yielded with the first element of the save
     result as ``cb_id`` in its meta; the request is handled by
     ``self.parse_staff_profile``.  The URL and the id are printed as
     progress output around the request.
     """
     for staff in response.xpath(
             '//div[contains(@class, "staff-contact")]'):
         cb_items = self.parse_candidate_basic_item(staff)
         cb_id = MYSQLUtils.save(self, "candidate_basic", cb_items)[0]
         staff_profile_url = self.parse_staff_profile_url(staff)
         if staff_profile_url:
             # Single-argument print(...) emits the same text under the
             # Python 2 print statement and also compiles on Python 3.
             print(staff_profile_url)
             yield Request(staff_profile_url,
                           callback=self.parse_staff_profile,
                           meta={"cb_id": cb_id})
             print(cb_id)
Example #13
0
 def __init__(self, **kwargs):
     """Attach the MySQL connection and clean up before the crawl starts.

     ``mysql_connection`` is stored as ``self.db`` and
     ``MYSQLUtils.cleanup_data`` is called with this spider before the
     parent initialiser runs.

     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.db = mysql_connection
     MYSQLUtils.cleanup_data(self)
     parent = super(StanfordSpider, self)
     parent.__init__(**kwargs)
 def __init__(self, **kwargs):
     """Attach the MySQL connection and clean up before the crawl starts.

     ``mysql_connection`` is stored as ``self.db`` and
     ``MYSQLUtils.cleanup_data`` is called with this spider before the
     parent initialiser runs.

     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.db = mysql_connection
     MYSQLUtils.cleanup_data(self)
     parent = super(ComputerScienceOfWaterLooSpider, self)
     parent.__init__(**kwargs)
 def __init__(self, **kwargs):
     """Attach the MySQL connection and clean up the category data.

     ``mysql_connection`` is stored as ``self.db`` and
     ``MYSQLUtils.cleanup_google_category`` is called with this spider
     before the parent initialiser runs.

     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.db = mysql_connection
     MYSQLUtils.cleanup_google_category(self)
     parent = super(GoogleScholarCategorySpider, self)
     parent.__init__(**kwargs)
 def parse_item(self, response):
     """Parse one candidate page and save every section to MySQL.

     The basic item is saved to ``candidate_basic`` first; the first
     element of that save result is passed to each section parser.  Every
     section is parsed and saved before the next one is parsed.
     """
     basic = self.parse_candidate_basic_item(response)
     owner_id = MYSQLUtils.save(self, 'candidate_basic', basic)[0]

     def store(table, build_items):
         # Parse one section, then save it under ``table``.
         section_items = build_items(response, owner_id)
         MYSQLUtils.save(self, table, section_items)

     store('candidate_education', self.parse_candidate_education_item)
     store('candidate_research', self.parse_candidate_research_item)
     store('candidate_publications', self.parse_candidate_publications_item)
     store('candidate_courses', self.parse_candidate_courses_item)
     store('candidate_workexperience',
           self.parse_candidate_workexperience_item)
Example #17
0
    def parse_item(self, response):
        """Parse one candidate page and persist it in the configured format.

        With ``self.fmt == "mysql"`` the basic item is saved first and the
        first element of the save result is used as the candidate id; the
        five section results are saved once all of them are parsed.  For
        any other format the id comes from ``self.fh.generate_id`` applied
        to the basic item's ``fullname`` + ``url``; every parsed item is
        then stored under ``self.fh.data[<table>]['item']`` and
        ``self.fh.write(self.fmt)`` is called.
        """
        basic = self.parse_candidate_basic_item(response)
        if self.fmt == "mysql":
            owner_id = MYSQLUtils.save(self, "candidate_basic", basic)[0]
        else:
            owner_id = self.fh.generate_id(basic['fullname'] + basic['url'])

        # All sections are parsed before anything further is persisted.
        parsed = [
            ("candidate_education",
             self.parse_candidate_education_item(response, owner_id)),
            ("candidate_research",
             self.parse_candidate_research_item(response, owner_id)),
            ("candidate_publications",
             self.parse_candidate_publications_item(response, owner_id)),
            ("candidate_courses",
             self.parse_candidate_courses_item(response, owner_id)),
            ("candidate_workexperience",
             self.parse_candidate_workexperience_item(response, owner_id)),
        ]

        if self.fmt == "mysql":
            for table, section_items in parsed:
                MYSQLUtils.save(self, table, section_items)
            return

        self.fh.data['candidate_basic']['item'] = basic
        for table, section_items in parsed:
            self.fh.data[table]['item'] = section_items
        self.fh.write(self.fmt)
Example #18
0
 def __init__(self, mode=None, **kwargs):
     """Attach the MySQL connection and clean up author/affiliation data.

     ``mysql_connection`` is stored as ``self.db`` and
     ``MYSQLUtils.cleanup_google_author_affiliations`` is called with this
     spider and ``self.domain`` before the parent initialiser runs.

     :param mode: accepted but not used by this initialiser, and not
         forwarded to the parent.
     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     self.db = mysql_connection
     # ``self.domain`` is read before the parent initialiser runs, so it
     # is presumably a class attribute -- TODO confirm.
     MYSQLUtils.cleanup_google_author_affiliations(self, self.domain)
     parent = super(ScienceDirectSpider, self)
     parent.__init__(**kwargs)