Esempio n. 1
0
    def parse(self, response):
        """Collect author and affiliation records from an article page.

        Every node whose class contains ``authorName`` becomes a
        ``GoogleAuthorsItem`` and every node whose class contains
        ``affiliation`` becomes a ``GoogleAffiliationItem``.  Each record is
        tagged with the ``publication_id`` and ``article_id`` read from
        ``response.meta``.  The two batches are then handed to
        ``MYSQLUtils.save`` for the ``google_authors`` and
        ``google_affiliations`` tables, authors first.
        """

        def build_author(node):
            # One author record per matched node.
            record = GoogleAuthorsItem()
            record['publication_id'] = response.meta['publication_id']
            record['article_id'] = response.meta['article_id']
            # Always stored empty: this method never pairs an author with
            # an affiliation.
            record['affiliation_id'] = ''
            record['fullname'] = DataFilter.simple_format(
                node.xpath('.').extract())
            record['create_time'] = mysql_datetime()
            return record

        def build_affiliation(node):
            # One affiliation record per matched node.
            record = GoogleAffiliationItem()
            record['publication_id'] = response.meta['publication_id']
            record['article_id'] = response.meta['article_id']
            record['desc'] = DataFilter.simple_format(
                node.xpath('.').extract())
            record['create_time'] = mysql_datetime()
            return record

        authors = [
            build_author(node)
            for node in response.xpath('//*[contains(@class, "authorName")]')
        ]
        affiliations = [
            build_affiliation(node)
            for node in response.xpath('//*[contains(@class, "affiliation")]')
        ]

        MYSQLUtils.save(self, "google_authors", authors)
        MYSQLUtils.save(self, "google_affiliations", affiliations)
 def parse_cate2(self, response):
     """Store the sub-categories linked from a first-level category page.

     Each anchor below the ``gs_m_rbs`` element yields one
     ``GoogleCategoryItem`` whose ``fid`` is the ``cate1_id`` carried in
     ``response.meta``.  The whole batch is passed to ``MYSQLUtils.save``
     for the ``google_category`` table.
     """
     parent_id = response.meta['cate1_id']
     categories = []
     for link in response.xpath('//*[@id="gs_m_rbs"]/descendant::a'):
         category = GoogleCategoryItem()
         category['fid'] = parent_id
         category['name'] = DataFilter.simple_format(
             link.xpath('.').extract())
         # The href is appended to self.domain as-is to form the URL.
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         category['cate_url'] = "%s%s" % (self.domain, href)
         category['create_time'] = mysql_datetime()
         categories.append(category)
     MYSQLUtils.save(self, "google_category", categories)
Esempio n. 3
0
 def parse(self, response):
     """Store each ranked publication row and follow its article list.

     Every ``tr`` after the first one in the ``gs_cit_list_table`` table is
     turned into a ``GooglePublicationItem``: rank from cell 1, name from
     cell 2, ``h5_idx`` from cell 3 and ``h5_med`` from cell 4.  The item is
     saved to ``google_publication``; the first element of the save result
     is used as the publication id.  A ``Request`` for the link found in
     cell 3 is then yielded with ``parse_article_list`` as callback.
     """

     def cell_text(row, expr):
         # Formatted text of whatever `expr` selects relative to `row`.
         return DataFilter.simple_format(row.xpath(expr).extract())

     table_rows = response.xpath(
         '//*[@id="gs_cit_list_table"]/tr[position()>1]')
     for tr in table_rows:
         publication = GooglePublicationItem()
         publication['cate1_id'] = response.meta['cate1_id']
         publication['cate2_id'] = response.meta['cate2_id']
         publication['name'] = cell_text(tr, 'td[position()=2]')
         publication['desc'] = ''
         publication['h5_idx'] = cell_text(tr, 'td[position()=3]')
         publication['h5_med'] = cell_text(tr, 'td[position()=4]')
         publication['rank'] = cell_text(tr, 'td[position()=1]')
         publication['create_time'] = mysql_datetime()

         href = cell_text(tr, 'td[position()=3]/a/@href')
         article_list_url = "%s%s" % (self.domain, href)

         saved = MYSQLUtils.save(self, "google_publication", publication)
         # response.meta is updated in place, so the follow-up request
         # carries the incoming keys plus the two set here.
         response.meta['publication_id'] = saved[0]
         response.meta['h5_idx'] = publication['h5_idx']
         yield Request(article_list_url,
                       callback=self.parse_article_list,
                       meta=response.meta)
    def parse_staff_profile(self, response):
        """Extract and store the detail sections of one staff profile page.

        The ``field-type-text-with-summary`` block of the page is handed,
        together with the ``cb_id`` taken from ``response.meta``, to each
        ``parse_candidate_<section>_item`` method in turn.  Each result is
        saved to the matching ``candidate_<section>`` table before the next
        section is parsed.
        """
        cb_id = response.meta['cb_id']
        summary = response.xpath(
            '//div[contains(@class, "field-type-text-with-summary")]')

        # Order matters only in that it mirrors the sequence of saves.
        sections = ("education", "research", "publications", "courses",
                    "workexperience")
        for section in sections:
            extract = getattr(self, "parse_candidate_%s_item" % section)
            rows = extract(summary, cb_id)
            MYSQLUtils.save(self, "candidate_%s" % section, rows)
Esempio n. 5
0
 def insert_article(self, response):
     """Save the article described by ``response.meta`` and record its id.

     The stored link is group 1 of ``self.article_link_pattern`` matched
     against the page's ``<meta http-equiv="refresh">`` content; when the
     pattern does not match, ``response.url`` is used instead.  All other
     item fields are copied from ``response.meta`` using the keys returned
     by ``MYSQLUtils.get_columns_by_item``.  The first element of the save
     result is written back to ``response.meta['article_id']``.
     """
     refresh_content = DataFilter.simple_format(
         response.xpath('//meta[@http-equiv="refresh"]/@content').extract())
     redirect = re.search(self.article_link_pattern, refresh_content)
     link = redirect.group(1) if redirect else response.url

     article = GoogleArticlesItem()
     for column in MYSQLUtils.get_columns_by_item(article):
         article[column] = response.meta[column]
     # Set after the copy loop so it wins over any value taken from meta.
     article['article_link'] = link

     saved = MYSQLUtils.save(self, "google_articles", article)
     response.meta['article_id'] = saved[0]
 def parse(self, response):
     """Store every top-level category and request its page.

     Each anchor below the ``gs_m_broad`` element is saved to
     ``google_category`` as a ``GoogleCategoryItem`` with ``fid`` 0.  The
     first element of the save result is forwarded as ``cate1_id`` in the
     meta of a ``Request`` whose callback is ``parse_cate2``.
     """
     for link in response.xpath('//*[@id="gs_m_broad"]/descendant::a'):
         href = DataFilter.simple_format(link.xpath("./@href").extract())
         url = "%s%s" % (self.domain, href)
         label = DataFilter.simple_format(link.xpath('.').extract())

         category = GoogleCategoryItem()
         category['fid'] = 0
         category['name'] = label
         category['cate_url'] = url
         category['create_time'] = mysql_datetime()

         saved = MYSQLUtils.save(self, "google_category", category)
         yield Request(url,
                       callback=self.parse_cate2,
                       meta={"cate1_id": saved[0]})
 def parse(self, response):
     """Store the basic record of each listed staff member and follow profiles.

     Every ``div`` whose class contains ``staff-contact`` is passed to
     ``parse_candidate_basic_item`` and the result is saved to
     ``candidate_basic``; the first element of the save result is used as
     ``cb_id``.  When ``parse_staff_profile_url`` returns a truthy URL for
     the entry, a ``Request`` is yielded with ``parse_staff_profile`` as
     callback and ``cb_id`` in its meta.

     The URL and id are echoed to stdout as in the original code; the
     single-argument ``print(...)`` form prints the same text on Python 2
     and Python 3.
     """
     for staff in response.xpath(
             '//div[contains(@class, "staff-contact")]'):
         cb_items = self.parse_candidate_basic_item(staff)
         cb_id = MYSQLUtils.save(self, "candidate_basic", cb_items)[0]
         staff_profile_url = self.parse_staff_profile_url(staff)
         if not staff_profile_url:
             continue
         print(staff_profile_url)
         yield Request(staff_profile_url,
                       callback=self.parse_staff_profile,
                       meta={"cb_id": cb_id})
         # Runs only once the generator is resumed after the yield above.
         print(cb_id)
 def parse_item(self, response):
     """Parse one candidate page and save all of its sections.

     The basic record is saved to ``candidate_basic`` first; the first
     element of that save result is the ``cb_id`` passed to every
     ``parse_candidate_<section>_item`` method.  Each section's result is
     saved to the matching ``candidate_<section>`` table before the next
     section is parsed.
     """
     basic = self.parse_candidate_basic_item(response)
     cb_id = MYSQLUtils.save(self, 'candidate_basic', basic)[0]

     sections = ('education', 'research', 'publications', 'courses',
                 'workexperience')
     for section in sections:
         extract = getattr(self, 'parse_candidate_%s_item' % section)
         rows = extract(response, cb_id)
         MYSQLUtils.save(self, 'candidate_%s' % section, rows)
Esempio n. 9
0
    def parse_item(self, response):
        """Parse one candidate page and persist it to MySQL or the file handler.

        When ``self.fmt`` is ``"mysql"`` the basic record is saved to
        ``candidate_basic`` and the first element of the save result is the
        ``cb_id``.  Otherwise ``cb_id`` comes from ``self.fh.generate_id``
        applied to the record's ``fullname`` concatenated with its ``url``.

        All five detail sections are parsed before anything else is stored.
        In MySQL mode each section is then saved to its ``candidate_*``
        table.  In any other mode the basic record and the sections are
        placed under ``self.fh.data[<table>]['item']`` and
        ``self.fh.write(self.fmt)`` is called.
        """
        cb_item = self.parse_candidate_basic_item(response)
        if self.fmt == "mysql":
            cb_id = MYSQLUtils.save(self, "candidate_basic", cb_item)[0]
        else:
            cb_id = self.fh.generate_id(cb_item['fullname'] + cb_item['url'])

        # (table name, parsed rows) pairs, in the order they are persisted.
        details = []
        for section in ("education", "research", "publications", "courses",
                        "workexperience"):
            extract = getattr(self, "parse_candidate_%s_item" % section)
            details.append(("candidate_%s" % section,
                            extract(response, cb_id)))

        if self.fmt == "mysql":
            for table, rows in details:
                MYSQLUtils.save(self, table, rows)
            return

        self.fh.data['candidate_basic']['item'] = cb_item
        for table, rows in details:
            self.fh.data[table]['item'] = rows
        self.fh.write(self.fmt)