Code Example #1
from os import listdir
from os.path import exists

# fload/fdump, GrayscaleImage, NormalizedCharacterImage, Character and the
# IMAGES_FOLDER / NORMALIZED_HEIGHT constants come from the surrounding project.
def load_characters(neighbours, blur_scale, verbose=0):
    chars_file = 'characters_%s_%s.dat' % (blur_scale, neighbours)

    if exists(chars_file):
        if verbose:
            print 'Loading characters...'

        chars = fload(chars_file)
    else:
        if verbose:
            print 'Going to generate character objects...'

        chars = []

        # Each subfolder of IMAGES_FOLDER is named after the character it
        # contains; every image in it is normalized and reduced to a
        # feature vector.
        for char in sorted(listdir(IMAGES_FOLDER)):
            count = 0

            for image in sorted(listdir(IMAGES_FOLDER + char)):
                image = GrayscaleImage(IMAGES_FOLDER + char + '/' + image)
                norm = NormalizedCharacterImage(image, blur=blur_scale, \
                                                height=NORMALIZED_HEIGHT)
                character = Character(char, [], norm)
                character.get_single_cell_feature_vector(neighbours)
                chars.append(character)

                count += 1

                if verbose:
                    print 'Loaded character %s %d times' % (char, count)

        if verbose:
            print 'Saving characters...'

        fdump(chars, chars_file)

    return chars
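The loader above relies on fload() and fdump() from the surrounding project to cache objects on disk; their source is not shown here. A minimal sketch of what they might look like, assuming they are thin wrappers around pickle (the bodies are a guess; only the call signatures are taken from the code above):

# Hypothetical fload/fdump cache helpers; only the signatures are known
# from the loader above, the pickle-based bodies are an assumption.
import pickle

def fload(filename):
    # Load a previously dumped object from disk.
    with open(filename, 'rb') as f:
        return pickle.load(f)

def fdump(obj, filename):
    # Serialize an object so later runs can skip regeneration.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)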
Code Example #2
def load_classifier(neighbours, blur_scale, c=None, gamma=None, verbose=0):
    classifier_file = 'classifier_%s_%s.dat' \
            % (blur_scale, neighbours)
    classifier_path = DATA_FOLDER + classifier_file

    # Check the full path: the classifier is read from and saved to
    # DATA_FOLDER, not the working directory.
    if exists(classifier_path):
        if verbose:
            print 'Loading classifier...'

        classifier = Classifier(filename=classifier_path, \
                neighbours=neighbours, verbose=verbose)
    elif c is not None and gamma is not None:
        if verbose:
            print 'Training new classifier...'

        classifier = Classifier(c=c, gamma=gamma, neighbours=neighbours, \
                verbose=verbose)
        learning_set = load_learning_set(neighbours, blur_scale, \
                verbose=verbose)
        classifier.train(learning_set)
        classifier.save(classifier_path)
    else:
        raise Exception('No classifier file found; specify both the soft '
                'margin (c) and gamma to train a new one.')

    return classifier
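A usage sketch for the loader above; the parameter values are placeholders, not tuned settings from the project:

# First run: no saved classifier exists, so train with the supplied SVM
# parameters (placeholder values) and persist the result to DATA_FOLDER.
classifier = load_classifier(neighbours=12, blur_scale=1.0,
                             c=32, gamma=0.125, verbose=1)

# Later runs find classifier_1.0_12.dat and load it instead of training.
classifier = load_classifier(neighbours=12, blur_scale=1.0, verbose=1)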
Code Example #3
def load_test_set(neighbours, blur_scale, verbose=0):
    # Load the cached test set, or fall back on generate_sets() (element [1]).
    test_set_file = 'test_set_%s_%s.dat' % (blur_scale, neighbours)

    if exists(test_set_file):
        if verbose:
            print 'Loading test set...'

        test_set = fload(test_set_file)

        if verbose:
            print 'Test set:', [c.value for c in test_set]
    else:
        test_set = generate_sets(neighbours, blur_scale, verbose=verbose)[1]

    return test_set
Code Example #4
def load_learning_set(neighbours, blur_scale, verbose=0):
    # Same pattern as load_test_set, but element [0] of generate_sets().
    learning_set_file = 'learning_set_%s_%s.dat' % (blur_scale, neighbours)

    if exists(learning_set_file):
        if verbose:
            print 'Loading learning set...'

        learning_set = fload(learning_set_file)

        if verbose:
            print 'Learning set:', [c.value for c in learning_set]
    else:
        learning_set = generate_sets(neighbours, blur_scale, \
                verbose=verbose)[0]

    return learning_set
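Examples #3 and #4 are two halves of the same cache-or-generate pattern: both fall back on generate_sets(), which returns the pair (learning_set, test_set). An illustrative call sequence, with placeholder parameter values:

# Load (or generate) both halves of the dataset; the parameters must
# match so the cached *.dat files line up (the values are placeholders).
learning_set = load_learning_set(neighbours=12, blur_scale=1.0, verbose=1)
test_set = load_test_set(neighbours=12, blur_scale=1.0, verbose=1)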
Code Example #5
File: azScraper.py    Project: pythonl1/scrapers
import data
import sys

start_urls = "http://www.azlyrics.com/lyrics/shakira/empire.html"

# If this URL was already scraped, exit with the cached result instead of
# defining and running the spider again.
cached = data.exists(start_urls)
if cached:
    sys.exit(cached)
else:
    from scrapy.spider import Spider
    from scrapy.selector import Selector

    class LyricsSpiderClass(Spider):
        name = "lyrics"
        allowed_domains = ["www.azlyrics.com"]
        start_urls = [
            "http://www.azlyrics.com/lyrics/shakira/empire.html",
        ]

        def parse(self, response):
            sel = Selector(response)
            sites = sel.xpath('//div[@id="main"]')
            items = []
            for site in sites:
                item = {}
                item['lyrics'] = "".join(site.xpath('div/text()').extract())
                item['Artist'] = "".join(site.xpath('h2/text()').extract())
                item['Song_name'] = "".join(site.xpath('b/text()').extract())
                if len(item['lyrics']) != 0:
                    items.append(item)
            # (Snippet truncated in the source; presumably the items are
            # saved via the data module, as in the goodreads spider below.)
            return items
Code Example #6
File: goodreadSpider.py    Project: pythonl1/scrapers
import data
import sys

start_urls = "http://www.goodreads.com/quotes"

# Skip scraping entirely if these quotes were already saved.
cached = data.exists(start_urls)
if cached:
    sys.exit(cached)
else:
    from scrapy.spider import Spider
    from scrapy.selector import Selector

    class ScrapingGoodreadsSpider(Spider):
        name = "goodreads"
        allowed_domains = ["www.goodreads.com"]
        # Scrapy expects start_urls to be a list, not a bare string.
        start_urls = ["http://www.goodreads.com/quotes"]

        def parse(self, response):
            sel = Selector(response)
            sites = sel.xpath('//div')
            items = []
            for site in sites:
                item = {}
                # encode('UTF-8') already returns a byte string (Python 2 str).
                item['body'] = "".join(site.xpath('div[@class="quoteText"]/text()').extract()).encode('UTF-8')
                item['author'] = "".join(site.xpath('div[@class="quoteText"]/a/text()').extract()).encode('UTF-8')
                item['work'] = "".join(site.xpath('div[@class="quoteText"]/i/a/text()').extract()).encode('UTF-8')
                if len(item['body']) != 0:
                    items.append(item)
            # Save under the single start URL so data.exists() matches it
            # on the next run.
            data.saveQuotes(self.start_urls[0], self.allowed_domains, items)
            return items
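Both spiders gate on data.exists() and persist through data.saveQuotes() (or an analogous call in the lyrics spider), but the data module itself is not shown. A hypothetical sketch, assuming a JSON file keyed by start URL; the storage format and function bodies are guesses, only the two call signatures come from the spiders:

# Hypothetical data.py: caches scraped results in a JSON file keyed by
# the start URL. Only exists() and saveQuotes() appear in the spiders;
# everything else here is an assumption.
import json
import os

_STORE = 'scraped.json'

def _load():
    if os.path.exists(_STORE):
        with open(_STORE) as f:
            return json.load(f)
    return {}

def exists(url):
    # Return the cached entry for this URL, or None if it was never scraped.
    return _load().get(url)

def saveQuotes(url, domains, items):
    store = _load()
    store[url] = {'domains': domains, 'items': items}
    with open(_STORE, 'w') as f:
        json.dump(store, f)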