Esempio n. 1
0
    def parse(self, response):
        """Yield a ConferenceItem for every ``<p>`` element of the page.

        The title is the first text node of the first ``<div>`` following the
        paragraph; the authors are taken from the second line of the text of
        the second following ``<div>``, split on ``', '``. The year is fixed
        to ``'2012'``.
        """
        # Debug output: text of the first <h1> on the page.
        heading = response.xpath('//h1/text()').extract()
        print(heading[0])

        for paragraph in response.xpath('//p'):
            print(paragraph)
            paper = ConferenceItem()
            paper['year'] = '2012'
            print(paragraph.xpath('text()'))

            title_nodes = paragraph.xpath('following::div[1]/text()').extract()
            paper['title'] = title_nodes[0].strip()

            # Join all text nodes first; the names sit on the second line.
            author_text = ''.join(
                paragraph.xpath('following::div[2]/text()').extract())
            paper['authors'] = author_text.split('\n')[1].split(', ')

            yield paper
Esempio n. 2
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<div class="cp_pp">`` in a table row.

        The title is the text of the child link and the authors come from
        that link's ``title`` attribute, split on ``', '``. The year is fixed
        to ``'2006'``.
        """
        for cell in response.xpath('//tr//div[@class=\'cp_pp\']'):
            paper = ConferenceItem()
            paper['year'] = '2006'

            link_texts = cell.xpath('a/text()').extract()
            print(link_texts)
            paper['title'] = link_texts[0]

            link_titles = cell.xpath('a/@title').extract()
            print(link_titles)
            paper['authors'] = link_titles[0].split(', ')

            yield paper
Esempio n. 3
0
    def parse(self, response):
        """Yield a ConferenceItem for each selected paragraph of the page.

        Paragraphs after the first one inside the ``<div class="span12">``
        matched by the XPath are scanned. The title is the text of the link
        inside ``<strong>``; every ``<a>`` sibling following ``<strong>``
        contributes one author. The year is fixed to ``"2006"``.
        """
        selector = '//div[@class=\"span12\" and position()=3]//p[position()>1]'
        for paragraph in response.xpath(selector):
            print(paragraph)
            paper = ConferenceItem()
            paper['year'] = "2006"

            headline = paragraph.xpath('strong/a/text()').extract()
            print(headline)
            paper['title'] = headline[0].strip()

            names = [
                link.xpath('text()').extract()[0]
                for link in paragraph.xpath('strong/following-sibling::a')
            ]
            print(names)
            paper['authors'] = names
            yield paper
Esempio n. 4
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<a>`` that contains an ``<h3>``.

        The title is the stripped ``<h3>`` text; the authors are the text of
        the ``<span class="name">`` child split on ``', '``. The year is fixed
        to ``"2011"``.
        """
        for link in response.xpath('//a[h3]'):
            paper = ConferenceItem()
            print(link)
            paper['year'] = "2011"

            headings = link.xpath('h3/text()').extract()
            print(headings)
            paper['title'] = headings[0].strip()

            name_line = link.xpath('span[@class=\'name\']/text()').extract()[0]
            print(name_line)
            paper['authors'] = name_line.split(', ')

            yield paper
Esempio n. 5
0
    def parse(self, response):
        """Yield a ConferenceItem for each paragraph link followed by an ``<em>``.

        The link text is the title; the text of the following ``<em>``
        sibling, split on ``', '``, gives the authors. The year is fixed to
        ``"2009"``.
        """
        for link in response.xpath('//p//a[following-sibling::em]'):
            paper = ConferenceItem()
            print(link)
            paper['year'] = "2009"

            link_texts = link.xpath('text()').extract()
            print(link_texts)
            paper['title'] = link_texts[0]

            name_line = link.xpath('following-sibling::em/text()').extract()[0]
            print(name_line)
            paper['authors'] = name_line.split(', ')

            yield paper
Esempio n. 6
0
    def parse(self, response):
        """Yield a ConferenceItem for each link inside ``<div class="page">``.

        The title is the stripped link text. The author line is the first
        ``<em>`` text of the first ``<table>`` following the link. When it
        contains the word "and" it is split on ``', '`` and ``' and '``;
        otherwise the whole line is kept as a single author. The year is
        fixed to ``"2013"``.
        """
        for record in response.xpath('//div[@class=\'page\']//a'):

            item = ConferenceItem()
            item['year'] = "2013"
            data = record.xpath('text()').extract()
            item['title'] = data[0].strip()
            authors = record.xpath(
                'following::table[1]//em/text()').extract()[0].strip()
            # Look for the whole word: a bare substring test also matches
            # names such as "Sandra" or "Anderson" and would cut them apart.
            if ' and ' in authors:
                names = []
                for chunk in authors.split(', '):
                    # "A, B, and C" leaves "and C" as its own chunk.
                    if chunk.startswith('and '):
                        chunk = chunk[len('and '):]
                    names.extend(chunk.split(' and '))
                item['authors'] = [
                    name.strip() for name in names if name.strip()
                ]
            else:
                item['authors'] = [authors]
            yield item
Esempio n. 7
0
    def parse(self, response):
        """Yield a ConferenceItem for each list entry under ``<div class="main">``.

        The title is the first text node of the ``<li>``, stripped. The
        author line is the text of its ``<i>`` child. When it contains the
        word "and" it is split on ``', '`` and ``' and '``; otherwise the
        whole line is kept as a single author. The year is fixed to
        ``"2014"``.
        """
        for record in response.xpath('//div[@class=\'main\']//ul/li'):

            item = ConferenceItem()
            item['year'] = "2014"
            title = record.xpath('text()').extract()
            item['title'] = title[0].strip()
            authors = record.xpath('i/text()').extract()[0].strip()
            # Look for the whole word: a bare substring test also matches
            # names such as "Sandra" or "Anderson" and would cut them apart.
            if ' and ' in authors:
                names = []
                for chunk in authors.split(', '):
                    # "A, B, and C" leaves "and C" as its own chunk.
                    if chunk.startswith('and '):
                        chunk = chunk[len('and '):]
                    names.extend(chunk.split(' and '))
                item['authors'] = [
                    name.strip() for name in names if name.strip()
                ]
            else:
                item['authors'] = [authors]
            yield item
Esempio n. 8
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<strong>`` found inside a ``<section>``.

        The ``<strong>`` text is the title. The text node right after it is
        split on ``'; '`` and, for each entry, only the part before the first
        comma is kept as the author name. The year is fixed to ``'2015'``.
        """
        for heading in response.xpath('//section//strong'):
            paper = ConferenceItem()
            paper['year'] = '2015'
            paper['title'] = heading.xpath('text()').extract()[0]

            trailing = heading.xpath('following-sibling::text()[1]').extract()
            print(trailing)
            paper['authors'] = [
                entry.split(',')[0] for entry in trailing[0].split('; ')
            ]

            yield paper
Esempio n. 9
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<li>`` of a nested ``<ul>``.

        The first text node of the entry is the title and the second is the
        author line. When the author line contains the word "and" it is split
        on ``', '`` and ``' and '``; otherwise the whole line is kept as a
        single author. The year is fixed to ``"2015"``.
        """
        for record in response.xpath('//body/div/ul//ul/li'):

            item = ConferenceItem()
            item['year'] = "2015"
            data = record.xpath('text()').extract()
            item['title'] = data[0].strip()
            authors = data[1].strip()
            # Look for the whole word: a bare substring test also matches
            # names such as "Sandra" or "Anderson" and would cut them apart.
            if ' and ' in authors:
                names = []
                for chunk in authors.split(', '):
                    # "A, B, and C" leaves "and C" as its own chunk.
                    if chunk.startswith('and '):
                        chunk = chunk[len('and '):]
                    names.extend(chunk.split(' and '))
                item['authors'] = [
                    name.strip() for name in names if name.strip()
                ]
            else:
                item['authors'] = [authors]
            yield item
Esempio n. 10
0
    def parse(self, response):
        """Yield a ConferenceItem for each table cell that contains a link.

        The link text is the title; the text of the ``<i>`` child of the
        cell, split on ``', '``, gives the authors. The year is fixed to
        ``"2008"``.
        """
        for cell in response.xpath('//tr/td[a]'):
            paper = ConferenceItem()
            print(cell)
            paper['year'] = "2008"

            link_texts = cell.xpath('a/text()').extract()
            print(link_texts)
            paper['title'] = link_texts[0]

            name_line = cell.xpath('i/text()').extract()[0]
            print(name_line)
            paper['authors'] = name_line.split(', ')

            yield paper
Esempio n. 11
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<tr>`` of the page.

        Within the row, the cell holding a ``<strong>`` supplies both fields:
        the ``<strong>`` text is the title and the cell's own first text
        node, stripped and split on ``', '``, gives the authors. The year is
        fixed to ``"2006"``.
        """
        for row in response.xpath('//tr'):
            paper = ConferenceItem()
            print(row)
            paper['year'] = "2006"

            headings = row.xpath('td[strong]/strong/text()').extract()
            print(headings)
            paper['title'] = headings[0]

            cell_text = row.xpath('td[strong]/text()').extract()[0]
            print(cell_text)
            paper['authors'] = cell_text.strip().split(', ')

            yield paper
Esempio n. 12
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<dt>`` element of the page.

        The ``<dt>`` text is the author line; the title is the first text
        node found in the ``<dd>`` elements that follow it. When the author
        line contains the word "and" it is split on ``', '`` and ``' and '``;
        otherwise the whole line is kept as a single author. The year is
        fixed to ``"2007"``.
        """
        for record in response.xpath('//dt'):

            item = ConferenceItem()
            item['year'] = "2007"
            title = record.xpath('following::dd/text()').extract()
            item['title'] = title[0].strip()
            authors = record.xpath('text()').extract()[0].strip()
            # Look for the whole word: a bare substring test also matches
            # names such as "Sandra" or "Anderson" and would cut them apart.
            if ' and ' in authors:
                names = []
                for chunk in authors.split(', '):
                    # "A, B, and C" leaves "and C" as its own chunk.
                    if chunk.startswith('and '):
                        chunk = chunk[len('and '):]
                    names.extend(chunk.split(' and '))
                item['authors'] = [
                    name.strip() for name in names if name.strip()
                ]
            else:
                item['authors'] = [authors]
            yield item
Esempio n. 13
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<p class="left">`` element.

        The stripped link text is the title; the text of the ``<i>`` child,
        split on ``', '``, gives the authors. The year is fixed to
        ``"2015"``.
        """
        for paragraph in response.xpath('//p[@class=\'left\']'):
            paper = ConferenceItem()
            print(paragraph)
            paper['year'] = "2015"

            link_texts = paragraph.xpath('a/text()').extract()
            print(link_texts)
            paper['title'] = link_texts[0].strip()

            name_line = paragraph.xpath('i/text()').extract()[0]
            print(name_line)
            # split() already returns a fresh list holding every name.
            paper['authors'] = name_line.split(', ')

            yield paper
Esempio n. 14
0
    def parse(self, response):
        """Yield ConferenceItems from the rows of ``<table class="text12">``.

        Scanning starts at row position 3. One row lookup is made per
        ``<tr>`` matched in the table. A row whose second cell has a link
        without a ``title`` attribute is treated as a paper: the link text is
        the title and the link texts of the next sibling row are the authors;
        the cursor then advances by two rows. Any other row advances the
        cursor by one. The year is fixed to ``'2006'``.
        """
        row_position = 3
        row_query = '//table[@class=\'text12\']//tr[position()=%d]'
        title_query = 'td[2]/span/a[not(@title)]'

        # The loop variable only bounds the number of lookups performed.
        for _ in response.xpath('//table[@class=\'text12\']//tr'):
            row = response.xpath(row_query % row_position)

            if not row.xpath(title_query):
                row_position += 1
                continue

            titles = row.xpath(title_query + '/text()').extract()
            paper = ConferenceItem()
            paper['year'] = '2006'
            paper['title'] = titles[0]
            print(titles)

            names = row.xpath('following-sibling::tr[1]//a/text()').extract()
            print(names)
            paper['authors'] = names
            yield paper

            # Skip the author row that was just consumed.
            row_position += 2
Esempio n. 15
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<tr>`` that has no ``class`` attribute.

        The first text node of the second cell is the title and that of the
        first cell is the author line; newline-plus-space sequences are
        removed from both. When the author line contains the word "and" it is
        split on ``', '`` and ``' and '``; otherwise the whole line is kept as
        a single author. The year is fixed to ``"2008"``.
        """
        for record in response.xpath('//tr[not(@class)]'):

            item = ConferenceItem()
            item['year'] = "2008"
            title = record.xpath('td[2]/text()').extract()
            item['title'] = re.sub('\n ', '', title[0])
            authors = re.sub('\n ', '',
                             record.xpath('td[1]/text()').extract()[0])
            # Look for the whole word: a bare substring test also matches
            # names such as "Sandra" or "Anderson" and would cut them apart.
            if ' and ' in authors:
                names = []
                for chunk in authors.split(', '):
                    # "A, B, and C" leaves "and C" as its own chunk.
                    if chunk.startswith('and '):
                        chunk = chunk[len('and '):]
                    names.extend(chunk.split(' and '))
                item['authors'] = [
                    name.strip() for name in names if name.strip()
                ]
            else:
                item['authors'] = [authors]
            yield item
Esempio n. 16
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<dt>`` element of the page.

        The stripped ``<dt>`` text is the title; the first text node of the
        next ``<dd>`` sibling, stripped and split on ``', '``, gives the
        authors. The year is fixed to ``"2014"``.
        """
        for term in response.xpath('//dt'):
            paper = ConferenceItem()
            print(term)
            paper['year'] = "2014"

            term_texts = term.xpath('text()').extract()
            print(term_texts)
            paper['title'] = term_texts[0].strip()

            definition = term.xpath('following-sibling::dd[1]/text()').extract()
            name_line = definition[0]
            print(name_line)
            paper['authors'] = name_line.strip().split(', ')

            yield paper
Esempio n. 17
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<div class="paper">`` element.

        The ``<h2>`` text is the title; the text of ``<p class="authors">`` is
        split on commas and each piece is stripped to give the authors. The
        year is fixed to ``"2012"``.
        """
        for block in response.xpath('//div[@class=\'paper\']'):
            paper = ConferenceItem()
            print(block)
            paper['year'] = "2012"

            headings = block.xpath('h2/text()').extract()
            print(headings)
            paper['title'] = headings[0]

            name_line = block.xpath('p[@class=\'authors\']/text()').extract()[0]
            print(name_line)
            paper['authors'] = [piece.strip() for piece in name_line.split(',')]

            yield paper
Esempio n. 18
0
    def parse(self, response):
        """Yield ConferenceItems from each ``<div dir="ltr">`` inside a ``<tbody>``.

        All descendant text nodes of the block are read as alternating
        title / author lines. A leading-number pattern (``"<digits>. "``) is
        removed from the title and the author line is split on ``' and '``.
        A trailing title without an author line is ignored. The year is fixed
        to ``'2011'``.
        """
        for record in response.xpath('//tbody//div[@dir=\'ltr\']'):

            data = record.xpath('.//text()').extract()
            print(data)

            # Even positions hold titles, odd positions the matching authors;
            # zip() drops an unpaired trailing title instead of raising.
            for title, author_line in zip(data[0::2], data[1::2]):
                print(title)
                # Build a fresh item per paper: re-using a single instance
                # makes every yielded item alias the same mutable object.
                item = ConferenceItem()
                item['year'] = '2011'
                item['title'] = re.sub(r'\d+\. ', '', title).strip()
                authors = author_line.split(' and ')
                print(authors)
                item['authors'] = authors
                yield item
Esempio n. 19
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<p class="left">`` following an ``<h4>``.

        The year is read once from the page: the first text node of the
        third-position ``<p>`` under ``<div class="content">`` is split on
        ``', '`` and the second piece is used. Per record, the stripped link
        text is the title and the ``<i>`` text split on ``', '`` gives the
        authors.
        """
        records = response.xpath('//h4/following::p[@class=\'left\']')
        if not records:
            # Nothing to emit, so the year lookup is not needed either.
            return

        # The year is the same for the whole page; look it up once instead
        # of re-running a document-wide XPath for every record.
        year_text = response.xpath(
            '//div[@class=\'content\']//p[position()=3]/text()').extract()[0]
        year = year_text.split(', ')[1]

        for record in records:

            item = ConferenceItem()
            print(record)
            item['year'] = year
            data = record.xpath('a/text()').extract()
            print(data)
            item['title'] = data[0].strip()
            authors = record.xpath('i/text()').extract()[0]
            print(authors)
            item['authors'] = authors.split(', ')

            yield item
Esempio n. 20
0
    def parse(self, response):
        """Yield a ConferenceItem for each selected ``<p>`` under ``span/div``.

        Paragraphs after the first are scanned. The second text node is the
        title; the first is the author line, which has leading whitespace and
        surrounding dots removed and is then split on ``', '`` and
        ``' and '``. The year is fixed to ``'2008'``.
        """
        for paragraph in response.xpath('//span/div//p[position()>1]'):
            paper = ConferenceItem()
            texts = paragraph.xpath('text()').extract()
            print(texts)
            paper['year'] = '2008'
            paper['title'] = texts[1].strip()

            names = []
            for chunk in texts[0].lstrip().strip('.').split(', '):
                # split() hands back the chunk unchanged when ' and ' is absent.
                names.extend(chunk.split(' and '))

            paper['authors'] = names
            yield paper
Esempio n. 21
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<tr class="header">`` in a table.

        The stripped link text of the row is the title. Each text node in the
        second cell of the next ``<tr>`` is stripped and cut at the first
        ``' - '``; non-empty leading parts become the authors. The year is
        fixed to ``"2007"``.
        """
        for header_row in response.xpath('//table//tr[@class=\'header\']'):
            paper = ConferenceItem()
            print(header_row)
            paper['year'] = "2007"

            link_texts = header_row.xpath('td/a/text()').extract()
            print(link_texts)
            paper['title'] = link_texts[0].strip()

            detail_texts = header_row.xpath(
                'following::tr[1]/td[2]/text()').extract()
            print(detail_texts)

            names = []
            for text in detail_texts:
                lead = text.strip().split(' - ')[0]
                if lead:
                    names.append(lead)
            paper['authors'] = names

            yield paper
Esempio n. 22
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<li>`` that contains an ``<em>``.

        The first text node of the entry is stripped and everything after its
        first ``': '`` is used as the title. The ``<em>`` text split on
        ``', '`` gives the authors. The year is fixed to ``"2013"``.

        Raises:
            IndexError: if the entry text contains no ``': '`` separator or
                an expected node is missing.
        """
        for record in response.xpath('//div//ul//li[em]'):

            item = ConferenceItem()
            print(record)
            item['year'] = "2013"
            data = record.xpath('text()').extract()[0].strip()
            print(data)
            # Split only once so a title that itself contains ': ' several
            # times is kept whole instead of being cut after two pieces.
            item['title'] = data.split(': ', 1)[1]
            authors = record.xpath('em/text()').extract()[0]
            print(authors)
            item['authors'] = authors.split(', ')

            yield item
Esempio n. 23
0
    def parse(self, response):
        """Yield ConferenceItems from the text of ``<span class="bodyblack">``.

        Every non-blank text node is one paper, written as comma-separated
        fields: the first field is the title (with the ``* "`` prefix and any
        double quotes removed) and the remaining fields are the authors. The
        year is fixed to ``"2007"``.
        """
        for record in response.xpath('//span[@class=\'bodyblack\']'):

            data = record.xpath('text()').extract()
            print(data)
            for line in data:
                if not line.strip():
                    continue

                paper = line.strip().split(', ')
                item = ConferenceItem()
                title = paper[0].replace('* \"', '')
                title = title.replace('\"', '')

                item['title'] = title
                # Everything after the title field is an author. Slicing
                # from 1 keeps a comma-less line from listing its own title
                # text as the sole author.
                item['authors'] = paper[1:]
                item['year'] = "2007"

                yield item
Esempio n. 24
0
    def parse(self, response):
        """Yield a ConferenceItem for each data row of ``<table class="content">``.

        Rows after the first are scanned. The title is the first text node
        under ``<em>`` in the second cell, stripped and with newlines replaced
        by spaces; ``u'CN=CPCN'`` is used when no such text exists. The authors
        come from the ``sup/sub`` text of the third cell, split on ``', '``,
        with newlines in the last name replaced by spaces. The year is fixed
        to ``"2006"``.
        """
        rows = response.xpath(
            '//table[@class=\'content\']/tbody//tr[position()>1]')
        for row in rows:
            paper = ConferenceItem()
            print(row)
            paper['year'] = "2006"

            title_nodes = row.xpath('td[2]/p//em//text()').extract()
            print(title_nodes)
            # Fall back to a fixed placeholder when the row has no title text.
            raw_title = title_nodes[0] if title_nodes else u'CN=CPCN'
            paper['title'] = raw_title.strip().replace('\n', ' ')

            author_nodes = row.xpath(
                'td[position()=3 and p]/p/sup/sub/text()').extract()
            print(author_nodes)
            names = author_nodes[0].split(', ')
            names[-1] = names[-1].replace('\n', ' ')
            paper['authors'] = names

            yield paper
Esempio n. 25
0
    def parse(self, response):
        """Yield ConferenceItems from each ``<div dir="ltr">`` inside a ``<tbody>``.

        All descendant text nodes of the block are read as alternating
        title / author lines. The author line is split on ``' , '`` and any
        parenthesised text is removed from each name. A trailing title
        without an author line is ignored. The year is fixed to ``'2014'``.
        """
        for record in response.xpath('//tbody//div[@dir=\'ltr\']'):

            data = record.xpath('.//text()').extract()

            # Even positions hold titles, odd positions the matching authors;
            # zip() drops an unpaired trailing title instead of raising.
            for title, author_line in zip(data[0::2], data[1::2]):
                print(title)
                # Build a fresh item per paper: re-using a single instance
                # makes every yielded item alias the same mutable object.
                item = ConferenceItem()
                item['year'] = '2014'
                item['title'] = title
                authors = author_line.split(' , ')
                print(authors)
                item['authors'] = [
                    re.sub(r'\(.*\)', '', name).strip() for name in authors
                ]
                yield item
Esempio n. 26
0
 def lastparse(self, response):
     """Build a ConferenceItem from *response* and dump it to a file.

     ``response.meta`` must provide ``time``, ``department``, ``year``,
     ``house`` and ``file_name``. The first four are UTF-8 encoded into the
     item; ``Contents`` holds the extracted ``<body>`` markup. For every
     entry in ``Contents`` the four metadata lines followed by the encoded
     markup are written to ``<folder>/<file_name>``.

     Example page:
     http://kokkai.ndl.go.jp/SENTAKU/syugiin/164/0158/16405240158003c.html

     Returns:
         The populated ConferenceItem.
     """
     conferenceItem = ConferenceItem()
     conferenceItem['Times'] = response.meta['time'].encode('utf8')
     conferenceItem['Department'] = response.meta['department'].encode(
         'utf8')
     conferenceItem['Contents'] = response.xpath('//body').extract()
     conferenceItem['Years'] = response.meta['year'].encode('utf8')
     conferenceItem['Houses'] = response.meta['house'].encode('utf8')

     # NOTE(review): mkdir is a helper defined elsewhere; its return value is
     # used as the output directory prefix -- adjust the path as needed.
     folder = mkdir('../database/')
     # The context manager closes the handle even if a write fails; the
     # previous code never closed it and shadowed the ``file`` builtin.
     with open(folder + response.meta['file_name'], 'wb') as out:
         for content in conferenceItem['Contents']:
             out.write(conferenceItem['Times'] + b'\n')
             out.write(conferenceItem['Department'] + b'\n')
             out.write(conferenceItem['Years'] + b'\n')
             out.write(conferenceItem['Houses'] + b'\n')
             out.write(content.encode('utf8'))
     return conferenceItem
Esempio n. 27
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<p>`` found inside a ``<div>``.

        The stripped link text of ``<span class="title">`` is the title. Text
        nodes under ``<span class="authors">`` that do not contain the word
        ``papers`` are split on commas; each non-blank piece, stripped,
        becomes an author. The year is fixed to ``"2011"``.
        """
        for paragraph in response.xpath('//div//p'):
            paper = ConferenceItem()
            print(paragraph)
            paper['year'] = "2011"

            headline = paragraph.xpath(
                'span[@class=\'title\']/a/text()').extract()[0].strip()
            print(headline)
            paper['title'] = headline

            author_nodes = paragraph.xpath(
                'span[@class=\'authors\']//text()').extract()
            print(author_nodes)

            names = []
            for node in author_nodes:
                if 'papers' in node:
                    continue
                for piece in node.split(','):
                    cleaned = piece.strip()
                    if cleaned:
                        names.append(cleaned)
            paper['authors'] = names

            yield paper
Esempio n. 28
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<li>`` directly under a ``<ul>``.

        The ``<span>`` text is the title. The first text node of the entry is
        stripped, has leading ``'.'``, newline, space and ``'+'`` characters
        removed, and is split on ``';'``; for each entry the part before the
        first comma, left-stripped, is kept as an author when non-empty. The
        year is fixed to ``'2014'``.
        """
        for entry in response.xpath('//ul/li'):
            paper = ConferenceItem()
            paper['year'] = '2014'
            paper['title'] = entry.xpath('span/text()').extract()[0]

            raw_line = entry.xpath('text()').extract()[0]
            chunks = raw_line.strip().lstrip('.\n +').split(';')
            print(chunks)

            names = []
            for chunk in chunks:
                name = chunk.split(',')[0].lstrip()
                print(name)
                if name != '':
                    names.append(name)

            paper['authors'] = names

            yield paper
Esempio n. 29
0
    def parse(self, response):
        """Yield a ConferenceItem for each ``<h3>`` inside a table cell.

        The stripped ``<h3>`` text is the title. The ``<em>`` text of the
        following ``<p>`` sibling is split on ``'; '`` and any parenthesised
        text is removed from each name. The year is fixed to ``"2010"``.
        """
        for heading in response.xpath('//tr/td/h3'):
            paper = ConferenceItem()
            print(heading)
            paper['year'] = "2010"

            heading_texts = heading.xpath('text()').extract()
            print(heading_texts)
            paper['title'] = heading_texts[0].strip()

            emphasised = heading.xpath('following-sibling::p/em/text()').extract()
            name_line = emphasised[0]
            print(name_line)

            paper['authors'] = [
                re.sub(r'\(.*\)', '', entry).strip()
                for entry in name_line.split('; ')
            ]

            yield paper
Esempio n. 30
0
 def lastparse(self, response):
     """Build a ConferenceItem from *response* and dump it to a file.

     ``response.meta`` must provide ``time``, ``department``, ``year``,
     ``house`` and ``file_name``. The first four are UTF-8 encoded into the
     item; ``Contents`` holds the extracted ``<body>`` markup. For every
     entry in ``Contents`` the four metadata lines followed by the encoded
     markup are written to a file named ``file_name`` inside a fixed,
     hard-coded output directory.

     Example page:
     http://kokkai.ndl.go.jp/SENTAKU/syugiin/164/0158/16405240158003c.html

     Returns:
         The populated ConferenceItem.
     """
     conferenceItem = ConferenceItem()
     conferenceItem['Times'] = response.meta['time'].encode('utf8')
     conferenceItem['Department'] = response.meta['department'].encode(
         'utf8')
     conferenceItem['Contents'] = response.xpath('//body').extract()
     conferenceItem['Years'] = response.meta['year'].encode('utf8')
     conferenceItem['Houses'] = response.meta['house'].encode('utf8')

     target = (
         '/home/dreamer/documents/code/database/conference/syugiin_1970_1999/'
         + response.meta['file_name'])
     # The context manager closes the handle even if a write fails; the
     # previous code never closed it and shadowed the ``file`` builtin.
     with open(target, 'wb') as out:
         for content in conferenceItem['Contents']:
             out.write(conferenceItem['Times'] + b'\n')
             out.write(conferenceItem['Department'] + b'\n')
             out.write(conferenceItem['Years'] + b'\n')
             out.write(conferenceItem['Houses'] + b'\n')
             out.write(content.encode('utf8'))
     return conferenceItem