class ChildDefaultedItemLoader(DefaultedItemLoader):
    """Loader that post-processes names by swapping character case.

    Chains the parent's default input processor first, then
    `unicode.swapcase` (Python 2 built-in), so name values receive both
    transformations on input.
    """
    name_in = MapCompose(DefaultedItemLoader.default_input_processor, unicode.swapcase)
def parse_item(self, response):
    """Build and yield a product item from a product detail page.

    Skips "See Price in Cart" pages (no visible price), merges values
    carried over in ``response.meta``, registers XPaths for the product
    fields, collects the image gallery, and yields the loaded item.
    """
    # Pages with the "s20" price block only reveal the price in the cart —
    # there is no usable price here, so emit nothing.
    if response.xpath(
            '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-price"]/div[contains(@class, "s20")]'
    ):
        # See Price in Cart!
        return
    loader = self.get_product_item_loader_with_default_values(response)
    # Per-instance output-processor overrides for this page type.
    loader.description_out = JoinExcludingEmptyValues('\n')
    loader.sale_price_out = TakeFirst()
    # Values collected on the listing page are forwarded via response.meta.
    values_from_list = response.meta.get('values_from_list', {})
    reviews = response.meta.get('reviews', [])
    for key, value in values_from_list.iteritems():  # Python 2 dict API
        loader.add_value(key, value)
    loader.add_value('reviews', reviews)
    loader.add_xpath(
        'brand',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/h2/a/text()'
    )
    loader.add_xpath(
        'title',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/h1/text()'
    )
    loader.add_xpath(
        'description',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/p[contains(@class, "red")]'
    )
    loader.add_xpath('description', '//div[@class="prodOverview-section"]')
    loader.add_xpath(
        'original_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-msrp"]/div[2]/text()'
    )
    # The "super special" price is registered first; with
    # sale_price_out = TakeFirst() the first value found wins, so the
    # order of these two add_xpath calls matters.
    loader.add_xpath(
        'sale_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="super-special-price"]/div[2]/b/text()'
    )
    loader.add_xpath(
        'sale_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-price"]/div[2]/text()'
    )
    # The spec-list position of the size line varies (4th or 5th <li>);
    # the regex captures the value after the Korean label "포장 수량"
    # (package quantity).
    loader.add_xpath(
        'sizes',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/ul[@id="product-specs-list"]/li[4]/text()',
        re=ur'포장 수량: (.*)')
    loader.add_xpath(
        'sizes',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/ul[@id="product-specs-list"]/li[5]/text()',
        re=ur'포장 수량: (.*)')
    #images
    # Two gallery layouts exist: a thumbnail strip ("prod-im-sm-front")
    # or a single big image ("prod-im-big").
    if response.xpath(
            '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[@class="smImHolder"]/div[@class="prod-im-sm-front"]'
    ):
        for selector in response.xpath(
                '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[@class="smImHolder"]/div[@class="prod-im-sm-front"]'
        ):
            image_loader = ProductImageLoader(response=response,
                                              selector=selector)
            image_loader.add_xpath('thumbnail', 'a/img/@src')
            image_loader.add_xpath('normal_size', 'a/@href')
            image_loader.add_xpath('zoomed', 'a/@href')
            loader.add_value('images', image_loader.load_item())
    else:
        for selector in response.xpath(
                '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[contains(@class, "prod-im-big")]'
        ):
            image_loader = ProductImageLoader(response=response,
                                              selector=selector)
            # This layout has no thumbnail element: derive one from the
            # big-image URL by swapping the '/l/' path segment for '/b/'.
            image_loader.add_xpath(
                'thumbnail', 'a/@href',
                MapCompose(lambda url: url.replace('/l/', '/b/')))
            image_loader.add_xpath('normal_size', 'a/@href')
            image_loader.add_xpath('zoomed', 'a/@href')
            loader.add_value('images', image_loader.load_item())
    yield loader.load_item()
class WebsiteLoader(XPathItemLoader):
    """Loads Website items: strips whitespace on input, keeps the first value."""
    default_item_class = Website
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = TakeFirst()
class SpeakerLoader(XPathItemLoader):
    """Loads SpeakerItem records.

    Input: strip HTML tags, unquote markup entities, strip whitespace
    (Python 2 `unicode.strip`).  Output: join all values into one string.
    """
    default_item_class = SpeakerItem
    default_input_processor = MapCompose(remove_tags, unquote_markup, unicode.strip)
    default_output_processor = Join()
# NOTE(review): the indented lines below continue a date-parsing helper
# (evidently `to_datetime(value, format, locale=None)`, judging from the
# call in to_date below) whose `def` header lies outside this chunk —
# indentation reconstructed accordingly; confirm against the full file.
    old_locale = localelib.getlocale(localelib.LC_ALL)
    localelib.setlocale(localelib.LC_ALL, locale)
    time_s = time.strptime(value, format)
    # Only year..minute are used (indices 0-4); seconds are dropped.
    dt = datetime.datetime(*time_s[0:5])
    # 1900 is the default year from strptime, means no year parsed
    if dt.year == 1900:
        dt = dt.replace(year=datetime.datetime.utcnow().year)
    if locale:
        # Restore the locale that was active before parsing.
        localelib.setlocale(localelib.LC_ALL, old_locale)
    return dt


def to_date(value, format, locale=None):
    """Parse `value` with `format` (optionally under `locale`); return a date."""
    return to_datetime(value, format, locale).date()


def to_time(value, format):
    """Parse `value` with `format`; return a datetime.time (hour, minute only)."""
    time_s = time.strptime(value, format)
    return datetime.time(time_s[3], time_s[4])


# defaults
# Module-level default processors shared by loaders in this file:
# unquote markup, normalize <br>, drop tags, unescape, strip, collapse spaces.
default_input_processor = MapCompose(unquote_markup, replace_br, remove_tags,
                                     replace_escape, strip, clean_spaces)
default_output_processor = TakeFirst()
class NumberField(PredefinedField):
    """Field whose values are coerced to float and collapsed to a single value."""
    defaults = {
        'input_processor': MapCompose(float),
        'output_processor': SingleValue()
    }
class UrlField(PredefinedField):
    """Field whose values are resolved to absolute URLs, single-valued output."""
    defaults = {
        'input_processor': MapCompose(get_absolute_url),
        'output_processor': SingleValue()
    }
def test_mapcompose(self):
    """MapCompose drops values a function maps to None and chains the rest."""
    def drop_world(value):
        # Returning None removes the value from the processing stream.
        if value == 'world':
            return None
        return value

    processor = MapCompose(drop_world, unicode.upper)
    result = processor([u'hello', u'world', u'this', u'is', u'scrapy'])
    self.assertEqual(result, [u'HELLO', u'THIS', u'IS', u'SCRAPY'])
class TestXPathItemLoader(XPathItemLoader):
    """Test loader: title-cases every name value on input."""
    default_item_class = TestItem
    name_in = MapCompose(lambda v: v.title())
class ChildItemLoader(TestItemLoader):
    """Child loader: url values go through processor_with_args with a bound keyword."""
    url_in = MapCompose(processor_with_args, key=u'val')
class ChildItemLoader(TestItemLoader):
    """Child loader: url values go through `processor` on input."""
    url_in = MapCompose(processor)
class DefaultedItemLoader(NameItemLoader):
    """Loader whose default input processor drops the last character of each value."""
    default_input_processor = MapCompose(lambda v: v[:-1])
class TestItemLoader(NameItemLoader):
    """Test loader: title-cases name values on input."""
    name_in = MapCompose(lambda v: v.title())
class TestItemLoader(ItemLoader):
    """Test loader: converts name values to float on OUTPUT (not input)."""
    default_item_class = TestItem
    name_out = MapCompose(float)
class TakeFirstItemLoader(ItemLoader):
    """Loader that strips each value (Python 2 `unicode.strip`) and keeps the first."""
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
def parse(self, response):
    """Scrape one work-description page into a JonasWorkItem.

    Extracts bibliographic fields by XPath, splits the composition-period
    cell into period + parenthesized note via regex, counts witnesses,
    and collects the bibliography entries.
    """
    self.log("Our starting URL is %s"%self.start_urls)

    def absolutize_url(rel_url):
        # Resolve a relative href against the current page URL.
        return urljoin(response.url, rel_url)

    l = ItemLoader(item=JonasWorkItem(), response=response)
    l.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    l.add_xpath('title', '//td[@class="titre"]/text()')
    l.add_xpath('author', '//td[@class="auteur"]/text()')
    l.add_xpath('incipit', '//td[@class="incipit"]/text()')
    l.add_xpath('shape', '//table[@class="table_identification"]/tr[8]/td[2]/text()')
    # Cell looks like "<period> (<note>)"; group 2 is the optional note.
    r = re.compile(u'(.*) (?:\((.*)\))')
    com_text = response.xpath('//table[@class="table_identification"]/tr[12]/td[2]/text()').extract()[0]
    res = r.search(com_text)
    if res:
        res = map(lambda x: x if x else u'', res.groups())  # replace None with u''
        composition_period = res[0]
        note_work = res[1]
    else:
        composition_period = u''
        note_work = u''
        self.log('Error while parsing composition period.\n%s'%com_text)
    l.add_value('composition_period', composition_period)
    l.add_value('note_work', note_work)
    l.add_xpath('language', '//table[@class="table_identification"]/tr[13]/td[2]/text()')
    l.add_xpath('other_authors', '//table[@class="table_autres"]/tr/td[2]/ul/li/a/span/text()')
    l.add_xpath('role', '//table[@class="table_autres"]/tr/td[2]/ul/li/span[1]/text()')
    l.add_xpath('hierarchy', '//ul[@class="thesaurus"]//text()')
    l.add_xpath('associated_link_detailed_works',
                '//div[@class="association"]/a/@href',
                MapCompose(absolutize_url)
                )
    l.add_xpath('associated_author', '//div[@class="association"]/span[@class="curauteuroeuvre"]/text()')
    l.add_xpath('associated_title', '//div[@class="association"]/span[@class="curtitreoeuvre"]/text()')
    l.add_xpath('associated_incipit', '//div[@class="association"]/span[@class="curincipitoeuvre"]/text()')
    # Witness count appears as e.g. "12 témoins" in the second block.
    r = re.compile(u'(\d*) témoin')
    wit = response.xpath('//div[@id="temoins"]/div[2]/text()').extract()[0]
    wit = wit.replace(u'\xa0', u' ')  # u'\xa0' is nonbreaking space
    res = r.search(wit)
    if res:
        num_wit = res.group(1)
    else:
        num_wit = u''
        self.log("Error parsing number of witnesses:\n%s"%wit)
    l.add_value('number_of_witnesses', num_wit)
    l.add_xpath('manuscripts', '//div[@class="un_temoin temoin"]')
    l.add_xpath('bibliography_link',
                '//div[@id="blocBibliographies"]/div/a/@href',
                MapCompose(absolutize_url)
                )
    bib = []
    for x in response.xpath('//div[@class="bibliolink"]').extract():
        bib.append(extract_text(x)[0].replace(u'\xa0', u' '))  # u'\xa0' is nonbreaking space
    l.add_value('bibliography', bib)
    return l.load_item()
class CharField(PredefinedField):
    """Text field: strips whitespace, single-valued output, empty-string default."""
    defaults = {
        'input_processor': MapCompose(lambda chars: chars.strip()),
        'output_processor': SingleValue(),
        'default_value': ''
    }
class MyItemLoader(ItemLoader):
    """Strips values on input, keeps the first value, but joins descriptions."""
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
class DateField(PredefinedField):
    """Date field: formats incoming date objects as ISO 'YYYY-MM-DD' strings."""
    defaults = {
        'input_processor': MapCompose(lambda date: date.strftime("%Y-%m-%d")),
        'output_processor': SingleValue()
    }
class PostLoader(XPathItemLoader):
    """Loads forum posts; each input chain strips, then parses its field."""
    default_output_processor = TakeFirst()
    # Python 2 `unicode.strip`: assumes extracted values are unicode strings.
    zeta_id_in = MapCompose(unicode.strip, extract_numbers)
    ip_address_in = MapCompose(unicode.strip, extract_ip_address)
    date_posted_in = MapCompose(unicode.strip, to_datetime_long)
class WindowsCleaningProductLoader(ProductLoader):
    """Product loader: coerce names to unicode and drop HTML entities;
    parse prices with the UK-format extractor."""
    name_in = MapCompose(unicode, remove_entities)
    price_in = MapCompose(extract_price2uk)
class RawPostLoader(XPathItemLoader):
    """Minimal post loader: only zeta_id is cleaned (strip, then digits)."""
    default_output_processor = TakeFirst()
    zeta_id_in = MapCompose(unicode.strip, extract_numbers)
class FlatLoader(XPathItemLoader):
    """Loads flat-listing items.

    Input: trim each value and collapse internal whitespace runs to a
    single space.  Output: keep the first extracted value.
    """
    # Raw string for the regex: '\s' is not a valid string escape (it only
    # worked by accident and is a DeprecationWarning/SyntaxWarning on
    # modern Python); r'\s+' is the correct, equivalent form.
    default_input_processor = MapCompose(
        lambda s: re.sub(r'\s+', ' ', s.strip()))
    default_output_processor = TakeFirst()
class BasicItemLoader(ItemLoader):
    """Base loader: drop HTML entities, strip (Python 2 `string.strip`
    module function), coerce via cast_string; keep the first value."""
    default_input_processor = MapCompose(remove_entities, string.strip, cast_string)
    default_output_processor = TakeFirst()
class ProyectoItemLoader(XPathItemLoader):
    """Loads ProyectoItem (legislative bill) records.

    Every input chain first applies fix_space + unicode.strip, then a
    field-specific normalizer; `partial(..., allow_empty=True)` variants
    accept blank values.
    """
    default_item_class = ProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()
    tipo_in = MapCompose(fix_space, unicode.strip, normalize_tipo_proyecto)
    camara_origen_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    camara_origen_expediente_in = MapCompose(fix_space, unicode.strip,
                                             normalize_codigo_expediente)
    origen_in = MapCompose(fix_space, unicode.strip, normalize_proyecto_origen)
    reproduccion_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))
    camara_revisora_in = MapCompose(
        fix_space, unicode.strip, partial(normalize_camara, allow_empty=True))
    camara_revisora_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))
    ley_numero_in = MapCompose(fix_space, unicode.strip, digits_only)
    mensaje_codigo_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_mensaje, allow_empty=True))
    publicacion_en_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_publicacion_en, allow_empty=True))
    publicacion_fecha_in = MapCompose(fix_space, unicode.strip, spanish_date)
    # Publication dates are emitted as ISO-8601 strings.
    publicacion_fecha_out = Compose(lambda v: v[0].isoformat())
    # Commission lists stay multi-valued (no TakeFirst collapse).
    comisiones_diputados_out = Identity()
    comisiones_senadores_out = Identity()
class CrawledItem(Item):
    """Scraped app-store entry.

    Most text fields share one pipeline: unquote markup entities, strip
    whitespace, then keep the first extracted value (or join them all).
    NOTE(review): `default=` is stored as Field metadata; whether anything
    reads it depends on code outside this chunk — confirm before relying
    on the defaults.
    """
    name = Field(
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    icon_link = Field(
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    source = Field(output_processor=TakeFirst(), )
    source_link = Field(output_processor=TakeFirst(), )
    rating = Field(output_processor=Join(), )
    # rating = Field()
    version = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    developer = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    sdk_support = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(),
    )
    category = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    screen_support = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(),
    )
    apk_size = Field(
        default='',
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    language = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    publish_date = Field(output_processor=TakeFirst(), )
    downloads = Field(
        default=0,
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    # downloads = Field()
    description = Field(
        default='',
        input_processor=MapCompose(unquote_markup, remove_comments,
                                   replace_escape_chars, strip_space),
        output_processor=Join(),
    )
    images = Field(
        default='',
        output_processor=Join(),
    )
    qr_link = Field(
        default='',
        output_processor=TakeFirst(),
    )
    download_link = Field(
        default='',
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
class PubmedLoader(XPathItemLoader):
    """Loads PubMed records: whitespace-normalized input, joined output.

    Changes from the original: raw string for the whitespace regex
    (``'\\s'`` is not a valid string escape), and the redundant trailing
    ``pass`` statement removed.  Behavior is otherwise identical.
    """
    default_input_processor = MapCompose(
        lambda s: re.sub(r'\s+', ' ', s.strip()))
    default_output_processor = Join()
    # NOTE(review): re.match(u'\s*', s) matches EVERY string (a zero-width
    # match at position 0 always succeeds), so this processor always yields
    # False.  The intent was probably re.match(u'\s*$', s) ("is blank") or
    # its negation; preserved as-is pending confirmation of intent.
    state_in = MapCompose(lambda s: not re.match(u'\s*', s))
class ProductLoader(ProductLoaderWithNameStrip):
    """Normalizes SKUs: coerce to unicode, strip, lowercase, then remove
    all hyphens and spaces so variants compare equal."""
    sku_in = MapCompose(unicode, unicode.strip, unicode.lower,
                        lambda v: v.replace('-', ''),
                        lambda v: v.replace(' ', ''))
class ReportItemLoader(XmlXPathItemLoader):
    """Loads a ReportItem from an SEC 10-Q/10-K XBRL filing.

    Class attributes define per-field input/output processor chains;
    ``__init__`` inspects the document (type, fiscal period, end date) and
    registers the XPaths for every financial concept of interest.  For
    documents that are not 10-Q/10-K, ``__init__`` returns early and the
    loader stays empty.
    """
    default_item_class = ReportItem
    default_output_processor = TakeFirst()

    symbol_in = MapCompose(ExtractText(), unicode.upper)
    symbol_out = Compose(get_symbol)

    amend_in = MapCompose(ExtractText(), str_to_bool)
    amend_out = Compose(get_amend)

    period_focus_in = MapCompose(ExtractText(), unicode.upper)
    period_focus_out = TakeFirst()

    revenues_in = MapCompose(MatchEndDate(float))
    revenues_out = Compose(imd_filter_member, imd_mult,
                           ImdSumMembersOr(imd_get_revenues))

    net_income_in = MapCompose(MatchEndDate(float))
    net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income)

    op_income_in = MapCompose(MatchEndDate(float))
    op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)

    # Per-share values at or above MAX_PER_SHARE_VALUE are discarded as bogus.
    eps_basic_in = MapCompose(MatchEndDate(float))
    eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value),
                            lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    eps_diluted_in = MapCompose(MatchEndDate(float))
    eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value),
                              lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    # Out-of-range or non-positive dividends collapse to 0.0.
    dividend_in = MapCompose(MatchEndDate(float))
    dividend_out = Compose(
        imd_get_per_share_value,
        lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)

    assets_in = MapCompose(MatchEndDate(float))
    assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_assets_in = MapCompose(MatchEndDate(float))
    cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_liab_in = MapCompose(MatchEndDate(float))
    cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    long_liab_in = MapCompose(MatchEndDate(float))
    long_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    property_in = MapCompose(MatchEndDate(float))
    property_out = Compose(imd_filter_member, imd_mult, imd_max)

    shares_in = MapCompose(MatchEndDate(float))
    shares_out = Compose(imd_filter_member, imd_mult, imd_max)

    equity_in = MapCompose(MatchEndDate(float))
    equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity)

    cash_in = MapCompose(MatchEndDate(float))
    cash_out = Compose(imd_filter_member, imd_mult, imd_max)

    # NOTE(review): MatchEndDate(float, True) — the second argument's
    # meaning is defined by MatchEndDate elsewhere in the project; confirm
    # before relying on it (cash-flow fields are the only users).
    cash_flow_op_in = MapCompose(MatchEndDate(float, True))
    cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    cash_flow_inv_in = MapCompose(MatchEndDate(float, True))
    cash_flow_inv_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    cash_flow_fin_in = MapCompose(MatchEndDate(float, True))
    cash_flow_fin_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    def __init__(self, *args, **kwargs):
        """Strip garbage from oversized responses, then register values and
        XPaths for all report fields (no-op for non-10-Q/10-K documents)."""
        response = kwargs.get('response')
        if len(response.body) > THRESHOLD_TO_CLEAN:
            # Remove some useless text to reduce memory usage
            body, __ = RE_XML_GARBAGE.subn(lambda m: '><', response.body)
            response = response.replace(body=body)
            kwargs['response'] = response
        super(ReportItemLoader, self).__init__(*args, **kwargs)

        symbol = self._get_symbol()
        print("_get_symbol: " + symbol)
        end_date = self._get_doc_end_date()
        fiscal_year = self._get_doc_fiscal_year()
        doc_type = self._get_doc_type()

        # ignore document that is not 10-Q or 10-K
        if not (doc_type and doc_type.split('/')[0] in ('10-Q', '10-K')):
            return

        # some documents set their amendment flag in DocumentType, e.g., '10-Q/A',
        # instead of setting it in AmendmentFlag
        amend = None
        if doc_type.endswith('/A'):
            amend = True
            doc_type = doc_type[0:-2]

        self.context.update({'end_date': end_date, 'doc_type': doc_type})
        self.add_xpath('symbol', '//dei:TradingSymbol')
        self.add_value('symbol', symbol)

        if amend:
            self.add_value('amend', True)
        else:
            self.add_xpath('amend', '//dei:AmendmentFlag')

        # A 10-K is always a full-year report; otherwise infer the quarter.
        if doc_type == '10-K':
            period_focus = 'FY'
        else:
            period_focus = self._get_period_focus(end_date)

        if not fiscal_year and period_focus:
            fiscal_year = self._guess_fiscal_year(end_date, period_focus)

        self.add_value('period_focus', period_focus)
        self.add_value('fiscal_year', fiscal_year)
        self.add_value('end_date', end_date)
        self.add_value('doc_type', doc_type)

        # Each list gives alternate GAAP tag spellings for the same concept.
        self.add_xpaths('revenues', [
            '//us-gaap:SalesRevenueNet',
            '//us-gaap:Revenues',
            '//us-gaap:SalesRevenueGoodsNet',
            '//us-gaap:SalesRevenueServicesNet',
            '//us-gaap:RealEstateRevenueNet',
            '//*[local-name()="NetRevenuesIncludingNetInterestIncome"]',
            '//*[contains(local-name(), "TotalRevenues") and contains(local-name(), "After")]',
            '//*[contains(local-name(), "TotalRevenues")]',
            '//*[local-name()="InterestAndDividendIncomeOperating" or local-name()="NoninterestIncome"]'
        ])
        self.add_xpath('revenues', '//us-gaap:FinancialServicesRevenue')
        self.add_xpath(
            'revenues',
            '//us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax')

        self.add_xpaths('net_income', [
            '//*[contains(local-name(), "NetLossIncome") and contains(local-name(), "Corporation")]',
            '//*[local-name()="NetIncomeLossAvailableToCommonStockholdersBasic" or local-name()="NetIncomeLoss"]',
            '//us-gaap:ProfitLoss',
            '//us-gaap:IncomeLossFromContinuingOperations',
            '//*[contains(local-name(), "IncomeLossFromContinuingOperations") and not(contains(local-name(), "Per"))]',
            '//*[contains(local-name(), "NetIncomeLoss")]',
            '//*[starts-with(local-name(), "NetIncomeAttributableTo")]'
        ])

        self.add_xpaths('op_income', ['//us-gaap:OperatingIncomeLoss'])

        self.add_xpaths('eps_basic', [
            '//us-gaap:EarningsPerShareBasic',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicShare',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare',
            '//*[contains(local-name(), "NetIncomeLoss") and contains(local-name(), "Per") and contains(local-name(), "Common")]',
            '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Basic")]',
            '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]',
            '//*[contains(local-name(), "NetLossPerShare")]',
            '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Basic")]',
            '//*[local-name()="BasicEarningsAttributableToStockholdersPerCommonShare"]',
            '//*[local-name()="Earningspersharebasicanddiluted"]',
            '//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]',
            '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]',
            '//us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic',
            '//*[local-name()="NetIncomeLossEPS"]',
            '//*[local-name()="NetLoss"]'
        ])

        self.add_xpaths('eps_diluted', [
            '//us-gaap:EarningsPerShareDiluted',
            '//us-gaap:IncomeLossFromContinuingOperationsPerDilutedShare',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare',
            '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]',
            '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]',
            '//*[contains(local-name(), "NetLossPerShare")]',
            '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]',
            '//*[local-name()="DilutedEarningsAttributableToStockholdersPerCommonShare"]',
            '//us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted',
            '//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]',
            '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]',
            '//us-gaap:EarningsPerShareBasic',
            '//*[local-name()="NetIncomeLossEPS"]',
            '//*[local-name()="NetLoss"]'
        ])

        self.add_xpaths('dividend', [
            '//us-gaap:CommonStockDividendsPerShareDeclared',
            '//us-gaap:CommonStockDividendsPerShareCashPaid'
        ])

        # if dividend isn't found in doc, assume it's 0
        self.add_value('dividend', 0.0)

        self.add_xpaths('assets', [
            '//us-gaap:Assets',
            '//us-gaap:AssetsNet',
            '//us-gaap:LiabilitiesAndStockholdersEquity'
        ])

        self.add_xpaths('cur_assets', ['//us-gaap:AssetsCurrent'])

        self.add_xpaths('cur_liab', ['//us-gaap:LiabilitiesCurrent'])

        self.add_xpaths('long_liab', [
            '//us-gaap:LongTermDebtNoncurrent',
            '//us-gaap:LongTermDebt',
            '//us-gaap:LongTermDebtAndCapitalLeaseObligations'
        ])

        self.add_xpaths('property', ['//us-gaap:PropertyPlantAndEquipmentNet'])

        self.add_xpaths(
            'shares',
            ['//us-gaap:WeightedAverageNumberOfSharesOutstandingBasic'])

        self.add_xpaths('equity', [
            '//*[local-name()="StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest" or local-name()="StockholdersEquity"]',
            '//*[local-name()="TotalCommonShareholdersEquity"]',
            '//*[local-name()="CommonShareholdersEquity"]',
            '//*[local-name()="CommonStockEquity"]',
            '//*[local-name()="TotalEquity"]',
            '//us-gaap:RetainedEarningsAccumulatedDeficit',
            '//*[contains(local-name(), "MembersEquityIncludingPortionAttributableToNoncontrollingInterest")]',
            '//us-gaap:CapitalizationLongtermDebtAndEquity',
            '//*[local-name()="TotalCapitalization"]'
        ])

        self.add_xpaths('cash', [
            '//us-gaap:CashCashEquivalentsAndFederalFundsSold',
            '//us-gaap:CashAndDueFromBanks',
            '//us-gaap:CashAndCashEquivalentsAtCarryingValue',
            '//us-gaap:Cash',
            '//*[local-name()="CashAndCashEquivalents"]',
            '//*[contains(local-name(), "CarryingValueOfCashAndCashEquivalents")]',
            '//*[contains(local-name(), "CashCashEquivalents")]',
            '//*[contains(local-name(), "CashAndCashEquivalents")]'
        ])

        self.add_xpaths('cash_flow_op', [
            '//us-gaap:NetCashProvidedByUsedInOperatingActivities',
            '//us-gaap:NetCashProvidedByUsedInOperatingActivitiesContinuingOperations'
        ])

        self.add_xpaths('cash_flow_inv', [
            '//us-gaap:NetCashProvidedByUsedInInvestingActivities',
            '//us-gaap:NetCashProvidedByUsedInInvestingActivitiesContinuingOperations'
        ])

        self.add_xpaths('cash_flow_fin', [
            '//us-gaap:NetCashProvidedByUsedInFinancingActivities',
            '//us-gaap:NetCashProvidedByUsedInFinancingActivitiesContinuingOperations'
        ])

    def _get_symbol(self):
        """Derive the ticker symbol from the filing's filename
        (``<symbol>-...``); None if the URL has no path component."""
        try:
            filename = self.context['response'].url.split('/')[-1]
            print("filename: " + filename)
            if filename == "a0330201910qdocument_htm.xml":
                # hard code for the report
                return "INTC"
            else:
                return filename.split('-')[0].upper()
        except IndexError:
            return None

    def _get_doc_fiscal_year(self):
        """Return dei:DocumentFiscalYearFocus as an int, or None if absent
        or non-numeric."""
        try:
            fiscal_year = self.selector.xpath(
                '//dei:DocumentFiscalYearFocus/text()')[0].extract()
            return int(fiscal_year)
        except (IndexError, ValueError):
            return None

    def _guess_fiscal_year(self, end_date, period_focus):
        # Guess fiscal_year based on document end_date and period_focus
        date = datetime.strptime(end_date, DATE_FORMAT)
        month_ranges = {
            'Q1': (2, 3, 4),
            'Q2': (5, 6, 7),
            'Q3': (8, 9, 10),
            'FY': (11, 12, 1)
        }
        month_range = month_ranges.get(period_focus)

        # Case 1: release Q1 around March, Q2 around June, ...
        # This is what most companies do
        if date.month in month_range:
            if period_focus == 'FY' and date.month == 1:
                return date.year - 1
            return date.year

        # How many days left before 10-K's release?
        days_left_table = {'Q1': 270, 'Q2': 180, 'Q3': 90, 'FY': 0}
        days_left = days_left_table.get(period_focus)

        # Other cases, assume end_date.year of its FY report equals to
        # its fiscal_year
        if days_left is not None:
            fy_date = date + timedelta(days=days_left)
            return fy_date.year

        return None

    def _get_doc_end_date(self):
        """Determine the report period end date as a DATE_FORMAT string.

        Candidates come from the URL filename and from
        dei:DocumentPeriodEndDate; the document date is cross-checked
        against the XBRL context endDate values before being trusted.
        """
        # the document end date could come from URL or document content
        # we need to guess which one is correct
        url_str = self.context['response'].url
        print(url_str)
        # .htm-style filenames carry no parseable YYYYMMDD suffix.
        if "htm" in url_str:
            URLDateValid = False
        else:
            URLDateValid = True
            url_date_str = url_str.split('-')[-1].split('.')[0]
            url_date = datetime.strptime(url_date_str, '%Y%m%d')
            url_date_str = url_date.strftime(DATE_FORMAT)
            print("Date from URL" + url_date_str)

        DocDateValid = True
        try:
            doc_date_str = self.selector.xpath(
                '//dei:DocumentPeriodEndDate/text()')[0].extract()
            print("Date from doc" + doc_date_str)
            doc_date = datetime.strptime(doc_date_str, DATE_FORMAT)
        except (IndexError, ValueError):
            DocDateValid = False
            #return url_date.strftime(DATE_FORMAT)

        context_date_strs = set(
            self.selector.xpath(
                '//*[local-name()="context"]//*[local-name()="endDate"]/text()'
            ).extract())

        # Only trust the document date if it also appears as a context
        # endDate somewhere in the filing.
        if DocDateValid:
            if doc_date_str in context_date_strs:
                date = doc_date
            else:
                DocDateValid = False

        if DocDateValid:
            date = doc_date
        elif URLDateValid:
            date = url_date
        else:
            # NOTE(review): `date` is never assigned on this path, so the
            # return below raises NameError right after this print (the
            # message also contains a typo, left untouched — runtime string).
            print("No peirod end date is found!!")
        return date.strftime(DATE_FORMAT)

    def _get_doc_type(self):
        """Return dei:DocumentType upper-cased ('10-Q', '10-K', ...), or None."""
        try:
            return self.selector.xpath(
                '//dei:DocumentType/text()')[0].extract().upper()
        except (IndexError, ValueError):
            return None

    def _get_period_focus(self, doc_end_date):
        """Infer the fiscal period ('FY'/'Q1'/'Q2'/'Q3') for `doc_end_date`.

        Prefers the explicit dei:DocumentFiscalPeriodFocus tag; otherwise
        buckets by the day distance between the document end date and the
        fiscal year end date.
        """
        try:
            return self.selector.xpath('//dei:DocumentFiscalPeriodFocus/text()'
                                       )[0].extract().strip().upper()
        except IndexError:
            pass

        try:
            doc_yr = doc_end_date.split('-')[0]
            yr_end_date = self.selector.xpath(
                '//dei:CurrentFiscalYearEndDate/text()')[0].extract()
            # CurrentFiscalYearEndDate is '--MM-DD'; substitute the doc year.
            yr_end_date = yr_end_date.replace('--', doc_yr + '-')
        except IndexError:
            return None

        doc_end_date = datetime.strptime(doc_end_date, '%Y-%m-%d')
        yr_end_date = datetime.strptime(yr_end_date, '%Y-%m-%d')
        delta_days = (yr_end_date - doc_end_date).days

        # Bucket by distance (in days) from the fiscal year end.
        if delta_days > -45 and delta_days < 45:
            return 'FY'
        elif (delta_days <= -45 and delta_days > -135) or delta_days > 225:
            return 'Q1'
        elif (delta_days <= -135 and delta_days > -225) or (delta_days > 135 and delta_days <= 225):
            return 'Q2'
        elif delta_days <= -225 or (delta_days > 45 and delta_days <= 135):
            return 'Q3'

        return 'FY'
class ChildItemLoader(TestItemLoader):
    """Child loader: applies the parent's name_in chain, then swapcases."""
    name_in = MapCompose(TestItemLoader.name_in, unicode.swapcase)