class ChildDefaultedItemLoader(DefaultedItemLoader):
    """Loader that post-processes names by swapping character case.

    Chains the parent's default input processor first, then
    `unicode.swapcase` (Python 2 built-in), so name values receive both
    transformations on input.
    """
    name_in = MapCompose(DefaultedItemLoader.default_input_processor, unicode.swapcase)
def parse_item(self, response):
    """Build and yield a product item from a product detail page.

    Skips "See Price in Cart" pages (no visible price), merges values
    carried over in ``response.meta``, registers XPaths for the product
    fields, collects the image gallery, and yields the loaded item.
    """
    # Pages with the "s20" price block only reveal the price in the cart —
    # there is no usable price here, so emit nothing.
    if response.xpath(
            '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-price"]/div[contains(@class, "s20")]'
    ):
        # See Price in Cart!
        return
    loader = self.get_product_item_loader_with_default_values(response)
    # Per-instance output-processor overrides for this page type.
    loader.description_out = JoinExcludingEmptyValues('\n')
    loader.sale_price_out = TakeFirst()
    # Values collected on the listing page are forwarded via response.meta.
    values_from_list = response.meta.get('values_from_list', {})
    reviews = response.meta.get('reviews', [])
    for key, value in values_from_list.iteritems():  # Python 2 dict API
        loader.add_value(key, value)
    loader.add_value('reviews', reviews)
    loader.add_xpath(
        'brand',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/h2/a/text()'
    )
    loader.add_xpath(
        'title',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/h1/text()'
    )
    loader.add_xpath(
        'description',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/p[contains(@class, "red")]'
    )
    loader.add_xpath('description', '//div[@class="prodOverview-section"]')
    loader.add_xpath(
        'original_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-msrp"]/div[2]/text()'
    )
    # The "super special" price is registered first; with
    # sale_price_out = TakeFirst() the first value found wins, so the
    # order of these two add_xpath calls matters.
    loader.add_xpath(
        'sale_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="super-special-price"]/div[2]/b/text()'
    )
    loader.add_xpath(
        'sale_price',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-action"]/section[@id="product-price"]/div[2]/text()'
    )
    # The spec-list position of the size line varies (4th or 5th <li>);
    # the regex captures the value after the Korean label "포장 수량"
    # (package quantity).
    loader.add_xpath(
        'sizes',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/ul[@id="product-specs-list"]/li[4]/text()',
        re=ur'포장 수량: (.*)')
    loader.add_xpath(
        'sizes',
        '//div[@id="mainContent"]/section[@id="product-summary"]/div[@id="product-specification"]/ul[@id="product-specs-list"]/li[5]/text()',
        re=ur'포장 수량: (.*)')
    #images
    # Two gallery layouts exist: a thumbnail strip ("prod-im-sm-front")
    # or a single big image ("prod-im-big").
    if response.xpath(
            '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[@class="smImHolder"]/div[@class="prod-im-sm-front"]'
    ):
        for selector in response.xpath(
                '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[@class="smImHolder"]/div[@class="prod-im-sm-front"]'
        ):
            image_loader = ProductImageLoader(response=response,
                                              selector=selector)
            image_loader.add_xpath('thumbnail', 'a/img/@src')
            image_loader.add_xpath('normal_size', 'a/@href')
            image_loader.add_xpath('zoomed', 'a/@href')
            loader.add_value('images', image_loader.load_item())
    else:
        for selector in response.xpath(
                '//div[@id="mainContent"]/section[@id="product-summary"]//div[@id="product-image"]/div[contains(@class, "prod-im-big")]'
        ):
            image_loader = ProductImageLoader(response=response,
                                              selector=selector)
            # This layout has no thumbnail element: derive one from the
            # big-image URL by swapping the '/l/' path segment for '/b/'.
            image_loader.add_xpath(
                'thumbnail', 'a/@href',
                MapCompose(lambda url: url.replace('/l/', '/b/')))
            image_loader.add_xpath('normal_size', 'a/@href')
            image_loader.add_xpath('zoomed', 'a/@href')
            loader.add_value('images', image_loader.load_item())
    yield loader.load_item()
class WebsiteLoader(XPathItemLoader):
    """Loads Website items: strips whitespace on input, keeps the first value."""
    default_item_class = Website
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = TakeFirst()
class SpeakerLoader(XPathItemLoader):
    """Loads SpeakerItem records.

    Input: strip HTML tags, unquote markup entities, strip whitespace
    (Python 2 `unicode.strip`).  Output: join all values into one string.
    """
    default_item_class = SpeakerItem
    default_input_processor = MapCompose(remove_tags, unquote_markup, unicode.strip)
    default_output_processor = Join()
# NOTE(review): the indented lines below continue a date-parsing helper
# (evidently `to_datetime(value, format, locale=None)`, judging from the
# call in to_date below) whose `def` header lies outside this chunk —
# indentation reconstructed accordingly; confirm against the full file.
    old_locale = localelib.getlocale(localelib.LC_ALL)
    localelib.setlocale(localelib.LC_ALL, locale)
    time_s = time.strptime(value, format)
    # Only year..minute are used (indices 0-4); seconds are dropped.
    dt = datetime.datetime(*time_s[0:5])
    # 1900 is the default year from strptime, means no year parsed
    if dt.year == 1900:
        dt = dt.replace(year=datetime.datetime.utcnow().year)
    if locale:
        # Restore the locale that was active before parsing.
        localelib.setlocale(localelib.LC_ALL, old_locale)
    return dt


def to_date(value, format, locale=None):
    """Parse `value` with `format` (optionally under `locale`); return a date."""
    return to_datetime(value, format, locale).date()


def to_time(value, format):
    """Parse `value` with `format`; return a datetime.time (hour, minute only)."""
    time_s = time.strptime(value, format)
    return datetime.time(time_s[3], time_s[4])


# defaults
# Module-level default processors shared by loaders in this file:
# unquote markup, normalize <br>, drop tags, unescape, strip, collapse spaces.
default_input_processor = MapCompose(unquote_markup, replace_br, remove_tags,
                                     replace_escape, strip, clean_spaces)
default_output_processor = TakeFirst()
class NumberField(PredefinedField):
    """Field whose values are coerced to float and collapsed to a single value."""
    defaults = {
        'input_processor': MapCompose(float),
        'output_processor': SingleValue()
    }
class UrlField(PredefinedField):
    """Field whose values are resolved to absolute URLs, single-valued output."""
    defaults = {
        'input_processor': MapCompose(get_absolute_url),
        'output_processor': SingleValue()
    }
def test_mapcompose(self):
    """MapCompose drops values a function maps to None and chains the rest."""
    def drop_world(value):
        # Returning None removes the value from the processing stream.
        if value == 'world':
            return None
        return value

    processor = MapCompose(drop_world, unicode.upper)
    result = processor([u'hello', u'world', u'this', u'is', u'scrapy'])
    self.assertEqual(result, [u'HELLO', u'THIS', u'IS', u'SCRAPY'])
class TestXPathItemLoader(XPathItemLoader):
    """Test loader: title-cases every name value on input."""
    default_item_class = TestItem
    name_in = MapCompose(lambda v: v.title())
class ChildItemLoader(TestItemLoader):
    """Child loader: url values go through processor_with_args with a bound keyword."""
    url_in = MapCompose(processor_with_args, key=u'val')
class ChildItemLoader(TestItemLoader):
    """Child loader: url values go through `processor` on input."""
    url_in = MapCompose(processor)
class DefaultedItemLoader(NameItemLoader):
    """Loader whose default input processor drops the last character of each value."""
    default_input_processor = MapCompose(lambda v: v[:-1])
class TestItemLoader(NameItemLoader):
    """Test loader: title-cases name values on input."""
    name_in = MapCompose(lambda v: v.title())
class TestItemLoader(ItemLoader):
    """Test loader: converts name values to float on OUTPUT (not input)."""
    default_item_class = TestItem
    name_out = MapCompose(float)
class TakeFirstItemLoader(ItemLoader):
    """Loader that strips each value (Python 2 `unicode.strip`) and keeps the first."""
    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(unicode.strip)
def parse(self, response):
    """Scrape one work-description page into a JonasWorkItem.

    Extracts bibliographic fields by XPath, splits the composition-period
    cell into period + parenthesized note via regex, counts witnesses,
    and collects the bibliography entries.
    """
    self.log("Our starting URL is %s"%self.start_urls)

    def absolutize_url(rel_url):
        # Resolve a relative href against the current page URL.
        return urljoin(response.url, rel_url)

    l = ItemLoader(item=JonasWorkItem(), response=response)
    l.add_xpath('permalink', '/html/body/div[1]/div[3]/fieldset[2]/p/a/text()')
    l.add_xpath('title', '//td[@class="titre"]/text()')
    l.add_xpath('author', '//td[@class="auteur"]/text()')
    l.add_xpath('incipit', '//td[@class="incipit"]/text()')
    l.add_xpath('shape', '//table[@class="table_identification"]/tr[8]/td[2]/text()')
    # Cell looks like "<period> (<note>)"; group 2 is the optional note.
    r = re.compile(u'(.*) (?:\((.*)\))')
    com_text = response.xpath('//table[@class="table_identification"]/tr[12]/td[2]/text()').extract()[0]
    res = r.search(com_text)
    if res:
        res = map(lambda x: x if x else u'', res.groups())  # replace None with u''
        composition_period = res[0]
        note_work = res[1]
    else:
        composition_period = u''
        note_work = u''
        self.log('Error while parsing composition period.\n%s'%com_text)
    l.add_value('composition_period', composition_period)
    l.add_value('note_work', note_work)
    l.add_xpath('language', '//table[@class="table_identification"]/tr[13]/td[2]/text()')
    l.add_xpath('other_authors', '//table[@class="table_autres"]/tr/td[2]/ul/li/a/span/text()')
    l.add_xpath('role', '//table[@class="table_autres"]/tr/td[2]/ul/li/span[1]/text()')
    l.add_xpath('hierarchy', '//ul[@class="thesaurus"]//text()')
    l.add_xpath('associated_link_detailed_works',
                '//div[@class="association"]/a/@href',
                MapCompose(absolutize_url)
                )
    l.add_xpath('associated_author', '//div[@class="association"]/span[@class="curauteuroeuvre"]/text()')
    l.add_xpath('associated_title', '//div[@class="association"]/span[@class="curtitreoeuvre"]/text()')
    l.add_xpath('associated_incipit', '//div[@class="association"]/span[@class="curincipitoeuvre"]/text()')
    # Witness count appears as e.g. "12 témoins" in the second block.
    r = re.compile(u'(\d*) témoin')
    wit = response.xpath('//div[@id="temoins"]/div[2]/text()').extract()[0]
    wit = wit.replace(u'\xa0', u' ')  # u'\xa0' is nonbreaking space
    res = r.search(wit)
    if res:
        num_wit = res.group(1)
    else:
        num_wit = u''
        self.log("Error parsing number of witnesses:\n%s"%wit)
    l.add_value('number_of_witnesses', num_wit)
    l.add_xpath('manuscripts', '//div[@class="un_temoin temoin"]')
    l.add_xpath('bibliography_link',
                '//div[@id="blocBibliographies"]/div/a/@href',
                MapCompose(absolutize_url)
                )
    bib = []
    for x in response.xpath('//div[@class="bibliolink"]').extract():
        bib.append(extract_text(x)[0].replace(u'\xa0', u' '))  # u'\xa0' is nonbreaking space
    l.add_value('bibliography', bib)
    return l.load_item()
class CharField(PredefinedField):
    """Text field: strips whitespace, single-valued output, empty-string default."""
    defaults = {
        'input_processor': MapCompose(lambda chars: chars.strip()),
        'output_processor': SingleValue(),
        'default_value': ''
    }
class MyItemLoader(ItemLoader):
    """Strips values on input, keeps the first value, but joins descriptions."""
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
class DateField(PredefinedField):
    """Date field: formats incoming date objects as ISO 'YYYY-MM-DD' strings."""
    defaults = {
        'input_processor': MapCompose(lambda date: date.strftime("%Y-%m-%d")),
        'output_processor': SingleValue()
    }
class PostLoader(XPathItemLoader):
    """Loads forum posts; each input chain strips, then parses its field."""
    default_output_processor = TakeFirst()
    # Python 2 `unicode.strip`: assumes extracted values are unicode strings.
    zeta_id_in = MapCompose(unicode.strip, extract_numbers)
    ip_address_in = MapCompose(unicode.strip, extract_ip_address)
    date_posted_in = MapCompose(unicode.strip, to_datetime_long)
class WindowsCleaningProductLoader(ProductLoader):
    """Product loader: coerce names to unicode and drop HTML entities;
    parse prices with the UK-format extractor."""
    name_in = MapCompose(unicode, remove_entities)
    price_in = MapCompose(extract_price2uk)
class RawPostLoader(XPathItemLoader):
    """Minimal post loader: only zeta_id is cleaned (strip, then digits)."""
    default_output_processor = TakeFirst()
    zeta_id_in = MapCompose(unicode.strip, extract_numbers)
class FlatLoader(XPathItemLoader):
    """Loads flat-listing items.

    Input: trim each value and collapse internal whitespace runs to a
    single space.  Output: keep the first extracted value.
    """
    # Raw string for the regex: '\s' is not a valid string escape (it only
    # worked by accident and is a DeprecationWarning/SyntaxWarning on
    # modern Python); r'\s+' is the correct, equivalent form.
    default_input_processor = MapCompose(
        lambda s: re.sub(r'\s+', ' ', s.strip()))
    default_output_processor = TakeFirst()
class BasicItemLoader(ItemLoader):
    """Base loader: drop HTML entities, strip (Python 2 `string.strip`
    module function), coerce via cast_string; keep the first value."""
    default_input_processor = MapCompose(remove_entities, string.strip, cast_string)
    default_output_processor = TakeFirst()
class ProyectoItemLoader(XPathItemLoader):
    """Loads ProyectoItem (legislative bill) records.

    Every input chain first applies fix_space + unicode.strip, then a
    field-specific normalizer; `partial(..., allow_empty=True)` variants
    accept blank values.
    """
    default_item_class = ProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()
    tipo_in = MapCompose(fix_space, unicode.strip, normalize_tipo_proyecto)
    camara_origen_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    camara_origen_expediente_in = MapCompose(fix_space, unicode.strip,
                                             normalize_codigo_expediente)
    origen_in = MapCompose(fix_space, unicode.strip, normalize_proyecto_origen)
    reproduccion_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))
    camara_revisora_in = MapCompose(
        fix_space, unicode.strip, partial(normalize_camara, allow_empty=True))
    camara_revisora_expediente_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_expediente, allow_empty=True))
    ley_numero_in = MapCompose(fix_space, unicode.strip, digits_only)
    mensaje_codigo_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_codigo_mensaje, allow_empty=True))
    publicacion_en_in = MapCompose(
        fix_space, unicode.strip,
        partial(normalize_publicacion_en, allow_empty=True))
    publicacion_fecha_in = MapCompose(fix_space, unicode.strip, spanish_date)
    # Publication dates are emitted as ISO-8601 strings.
    publicacion_fecha_out = Compose(lambda v: v[0].isoformat())
    # Commission lists stay multi-valued (no TakeFirst collapse).
    comisiones_diputados_out = Identity()
    comisiones_senadores_out = Identity()
class CrawledItem(Item):
    """Scraped app-store entry.

    Most text fields share one pipeline: unquote markup entities, strip
    whitespace, then keep the first extracted value (or join them all).
    NOTE(review): `default=` is stored as Field metadata; whether anything
    reads it depends on code outside this chunk — confirm before relying
    on the defaults.
    """
    name = Field(
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    icon_link = Field(
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    source = Field(output_processor=TakeFirst(), )
    source_link = Field(output_processor=TakeFirst(), )
    rating = Field(output_processor=Join(), )
    # rating = Field()
    version = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    developer = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    sdk_support = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(),
    )
    category = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    screen_support = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=Join(),
    )
    apk_size = Field(
        default='',
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    language = Field(
        default='',
        input_processor=MapCompose(unquote_markup, strip_space),
        output_processor=TakeFirst(),
    )
    publish_date = Field(output_processor=TakeFirst(), )
    downloads = Field(
        default=0,
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
    # downloads = Field()
    description = Field(
        default='',
        input_processor=MapCompose(unquote_markup, remove_comments,
                                   replace_escape_chars, strip_space),
        output_processor=Join(),
    )
    images = Field(
        default='',
        output_processor=Join(),
    )
    qr_link = Field(
        default='',
        output_processor=TakeFirst(),
    )
    download_link = Field(
        default='',
        input_processor=MapCompose(strip_space),
        output_processor=TakeFirst(),
    )
class PubmedLoader(XPathItemLoader):
    """Loads PubMed records: whitespace-normalized input, joined output.

    Changes from the original: raw string for the whitespace regex
    (``'\\s'`` is not a valid string escape), and the redundant trailing
    ``pass`` statement removed.  Behavior is otherwise identical.
    """
    default_input_processor = MapCompose(
        lambda s: re.sub(r'\s+', ' ', s.strip()))
    default_output_processor = Join()
    # NOTE(review): re.match(u'\s*', s) matches EVERY string (a zero-width
    # match at position 0 always succeeds), so this processor always yields
    # False.  The intent was probably re.match(u'\s*$', s) ("is blank") or
    # its negation; preserved as-is pending confirmation of intent.
    state_in = MapCompose(lambda s: not re.match(u'\s*', s))
class ProductLoader(ProductLoaderWithNameStrip):
    """Normalizes SKUs: coerce to unicode, strip, lowercase, then remove
    all hyphens and spaces so variants compare equal."""
    sku_in = MapCompose(unicode, unicode.strip, unicode.lower,
                        lambda v: v.replace('-', ''),
                        lambda v: v.replace(' ', ''))
class ReportItemLoader(XmlXPathItemLoader):
    """Loads a ReportItem from an SEC 10-Q/10-K XBRL filing.

    Class attributes define per-field input/output processor chains;
    ``__init__`` inspects the document (type, fiscal period, end date) and
    registers the XPaths for every financial concept of interest.  For
    documents that are not 10-Q/10-K, ``__init__`` returns early and the
    loader stays empty.
    """
    default_item_class = ReportItem
    default_output_processor = TakeFirst()

    symbol_in = MapCompose(ExtractText(), unicode.upper)
    symbol_out = Compose(get_symbol)

    amend_in = MapCompose(ExtractText(), str_to_bool)
    amend_out = Compose(get_amend)

    period_focus_in = MapCompose(ExtractText(), unicode.upper)
    period_focus_out = TakeFirst()

    revenues_in = MapCompose(MatchEndDate(float))
    revenues_out = Compose(imd_filter_member, imd_mult,
                           ImdSumMembersOr(imd_get_revenues))

    net_income_in = MapCompose(MatchEndDate(float))
    net_income_out = Compose(imd_filter_member, imd_mult, imd_get_net_income)

    op_income_in = MapCompose(MatchEndDate(float))
    op_income_out = Compose(imd_filter_member, imd_mult, imd_get_op_income)

    # Per-share values at or above MAX_PER_SHARE_VALUE are discarded as bogus.
    eps_basic_in = MapCompose(MatchEndDate(float))
    eps_basic_out = Compose(ImdSumMembersOr(imd_get_per_share_value),
                            lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    eps_diluted_in = MapCompose(MatchEndDate(float))
    eps_diluted_out = Compose(ImdSumMembersOr(imd_get_per_share_value),
                              lambda x: x if x < MAX_PER_SHARE_VALUE else None)

    # Out-of-range or non-positive dividends collapse to 0.0.
    dividend_in = MapCompose(MatchEndDate(float))
    dividend_out = Compose(
        imd_get_per_share_value,
        lambda x: x if x < MAX_PER_SHARE_VALUE and x > 0.0 else 0.0)

    assets_in = MapCompose(MatchEndDate(float))
    assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_assets_in = MapCompose(MatchEndDate(float))
    cur_assets_out = Compose(imd_filter_member, imd_mult, imd_max)

    cur_liab_in = MapCompose(MatchEndDate(float))
    cur_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    long_liab_in = MapCompose(MatchEndDate(float))
    long_liab_out = Compose(imd_filter_member, imd_mult, imd_max)

    property_in = MapCompose(MatchEndDate(float))
    property_out = Compose(imd_filter_member, imd_mult, imd_max)

    shares_in = MapCompose(MatchEndDate(float))
    shares_out = Compose(imd_filter_member, imd_mult, imd_max)

    equity_in = MapCompose(MatchEndDate(float))
    equity_out = Compose(imd_filter_member, imd_mult, imd_get_equity)

    cash_in = MapCompose(MatchEndDate(float))
    cash_out = Compose(imd_filter_member, imd_mult, imd_max)

    # NOTE(review): MatchEndDate(float, True) — the second argument's
    # meaning is defined by MatchEndDate elsewhere in the project; confirm
    # before relying on it (cash-flow fields are the only users).
    cash_flow_op_in = MapCompose(MatchEndDate(float, True))
    cash_flow_op_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    cash_flow_inv_in = MapCompose(MatchEndDate(float, True))
    cash_flow_inv_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    cash_flow_fin_in = MapCompose(MatchEndDate(float, True))
    cash_flow_fin_out = Compose(imd_filter_member, imd_mult, imd_get_cash_flow)

    def __init__(self, *args, **kwargs):
        """Strip garbage from oversized responses, then register values and
        XPaths for all report fields (no-op for non-10-Q/10-K documents)."""
        response = kwargs.get('response')
        if len(response.body) > THRESHOLD_TO_CLEAN:
            # Remove some useless text to reduce memory usage
            body, __ = RE_XML_GARBAGE.subn(lambda m: '><', response.body)
            response = response.replace(body=body)
            kwargs['response'] = response
        super(ReportItemLoader, self).__init__(*args, **kwargs)

        symbol = self._get_symbol()
        print("_get_symbol: " + symbol)
        end_date = self._get_doc_end_date()
        fiscal_year = self._get_doc_fiscal_year()
        doc_type = self._get_doc_type()

        # ignore document that is not 10-Q or 10-K
        if not (doc_type and doc_type.split('/')[0] in ('10-Q', '10-K')):
            return

        # some documents set their amendment flag in DocumentType, e.g., '10-Q/A',
        # instead of setting it in AmendmentFlag
        amend = None
        if doc_type.endswith('/A'):
            amend = True
            doc_type = doc_type[0:-2]

        self.context.update({'end_date': end_date, 'doc_type': doc_type})
        self.add_xpath('symbol', '//dei:TradingSymbol')
        self.add_value('symbol', symbol)

        if amend:
            self.add_value('amend', True)
        else:
            self.add_xpath('amend', '//dei:AmendmentFlag')

        # A 10-K is always a full-year report; otherwise infer the quarter.
        if doc_type == '10-K':
            period_focus = 'FY'
        else:
            period_focus = self._get_period_focus(end_date)

        if not fiscal_year and period_focus:
            fiscal_year = self._guess_fiscal_year(end_date, period_focus)

        self.add_value('period_focus', period_focus)
        self.add_value('fiscal_year', fiscal_year)
        self.add_value('end_date', end_date)
        self.add_value('doc_type', doc_type)

        # Each list gives alternate GAAP tag spellings for the same concept.
        self.add_xpaths('revenues', [
            '//us-gaap:SalesRevenueNet',
            '//us-gaap:Revenues',
            '//us-gaap:SalesRevenueGoodsNet',
            '//us-gaap:SalesRevenueServicesNet',
            '//us-gaap:RealEstateRevenueNet',
            '//*[local-name()="NetRevenuesIncludingNetInterestIncome"]',
            '//*[contains(local-name(), "TotalRevenues") and contains(local-name(), "After")]',
            '//*[contains(local-name(), "TotalRevenues")]',
            '//*[local-name()="InterestAndDividendIncomeOperating" or local-name()="NoninterestIncome"]'
        ])
        self.add_xpath('revenues', '//us-gaap:FinancialServicesRevenue')
        self.add_xpath(
            'revenues',
            '//us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax')

        self.add_xpaths('net_income', [
            '//*[contains(local-name(), "NetLossIncome") and contains(local-name(), "Corporation")]',
            '//*[local-name()="NetIncomeLossAvailableToCommonStockholdersBasic" or local-name()="NetIncomeLoss"]',
            '//us-gaap:ProfitLoss',
            '//us-gaap:IncomeLossFromContinuingOperations',
            '//*[contains(local-name(), "IncomeLossFromContinuingOperations") and not(contains(local-name(), "Per"))]',
            '//*[contains(local-name(), "NetIncomeLoss")]',
            '//*[starts-with(local-name(), "NetIncomeAttributableTo")]'
        ])

        self.add_xpaths('op_income', ['//us-gaap:OperatingIncomeLoss'])

        self.add_xpaths('eps_basic', [
            '//us-gaap:EarningsPerShareBasic',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicShare',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare',
            '//*[contains(local-name(), "NetIncomeLoss") and contains(local-name(), "Per") and contains(local-name(), "Common")]',
            '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Basic")]',
            '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]',
            '//*[contains(local-name(), "NetLossPerShare")]',
            '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Basic")]',
            '//*[local-name()="BasicEarningsAttributableToStockholdersPerCommonShare"]',
            '//*[local-name()="Earningspersharebasicanddiluted"]',
            '//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]',
            '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]',
            '//us-gaap:NetIncomeLossAvailableToCommonStockholdersBasic',
            '//*[local-name()="NetIncomeLossEPS"]',
            '//*[local-name()="NetLoss"]'
        ])

        self.add_xpaths('eps_diluted', [
            '//us-gaap:EarningsPerShareDiluted',
            '//us-gaap:IncomeLossFromContinuingOperationsPerDilutedShare',
            '//us-gaap:IncomeLossFromContinuingOperationsPerBasicAndDilutedShare',
            '//*[contains(local-name(), "Earnings") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]',
            '//*[local-name()="IncomePerShareFromContinuingOperationsAvailableToCompanyStockholdersBasicAndDiluted"]',
            '//*[contains(local-name(), "NetLossPerShare")]',
            '//*[contains(local-name(), "NetIncome") and contains(local-name(), "Per") and contains(local-name(), "Diluted")]',
            '//*[local-name()="DilutedEarningsAttributableToStockholdersPerCommonShare"]',
            '//us-gaap:NetIncomeLossAvailableToCommonStockholdersDiluted',
            '//*[contains(local-name(), "PerCommonShareBasicAndDiluted")]',
            '//*[local-name()="NetIncomeLossAttributableToCommonStockholdersBasicAndDiluted"]',
            '//us-gaap:EarningsPerShareBasic',
            '//*[local-name()="NetIncomeLossEPS"]',
            '//*[local-name()="NetLoss"]'
        ])

        self.add_xpaths('dividend', [
            '//us-gaap:CommonStockDividendsPerShareDeclared',
            '//us-gaap:CommonStockDividendsPerShareCashPaid'
        ])

        # if dividend isn't found in doc, assume it's 0
        self.add_value('dividend', 0.0)

        self.add_xpaths('assets', [
            '//us-gaap:Assets',
            '//us-gaap:AssetsNet',
            '//us-gaap:LiabilitiesAndStockholdersEquity'
        ])

        self.add_xpaths('cur_assets', ['//us-gaap:AssetsCurrent'])

        self.add_xpaths('cur_liab', ['//us-gaap:LiabilitiesCurrent'])

        self.add_xpaths('long_liab', [
            '//us-gaap:LongTermDebtNoncurrent',
            '//us-gaap:LongTermDebt',
            '//us-gaap:LongTermDebtAndCapitalLeaseObligations'
        ])

        self.add_xpaths('property', ['//us-gaap:PropertyPlantAndEquipmentNet'])

        self.add_xpaths(
            'shares',
            ['//us-gaap:WeightedAverageNumberOfSharesOutstandingBasic'])

        self.add_xpaths('equity', [
            '//*[local-name()="StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest" or local-name()="StockholdersEquity"]',
            '//*[local-name()="TotalCommonShareholdersEquity"]',
            '//*[local-name()="CommonShareholdersEquity"]',
            '//*[local-name()="CommonStockEquity"]',
            '//*[local-name()="TotalEquity"]',
            '//us-gaap:RetainedEarningsAccumulatedDeficit',
            '//*[contains(local-name(), "MembersEquityIncludingPortionAttributableToNoncontrollingInterest")]',
            '//us-gaap:CapitalizationLongtermDebtAndEquity',
            '//*[local-name()="TotalCapitalization"]'
        ])

        self.add_xpaths('cash', [
            '//us-gaap:CashCashEquivalentsAndFederalFundsSold',
            '//us-gaap:CashAndDueFromBanks',
            '//us-gaap:CashAndCashEquivalentsAtCarryingValue',
            '//us-gaap:Cash',
            '//*[local-name()="CashAndCashEquivalents"]',
            '//*[contains(local-name(), "CarryingValueOfCashAndCashEquivalents")]',
            '//*[contains(local-name(), "CashCashEquivalents")]',
            '//*[contains(local-name(), "CashAndCashEquivalents")]'
        ])

        self.add_xpaths('cash_flow_op', [
            '//us-gaap:NetCashProvidedByUsedInOperatingActivities',
            '//us-gaap:NetCashProvidedByUsedInOperatingActivitiesContinuingOperations'
        ])

        self.add_xpaths('cash_flow_inv', [
            '//us-gaap:NetCashProvidedByUsedInInvestingActivities',
            '//us-gaap:NetCashProvidedByUsedInInvestingActivitiesContinuingOperations'
        ])

        self.add_xpaths('cash_flow_fin', [
            '//us-gaap:NetCashProvidedByUsedInFinancingActivities',
            '//us-gaap:NetCashProvidedByUsedInFinancingActivitiesContinuingOperations'
        ])

    def _get_symbol(self):
        """Derive the ticker symbol from the filing's filename
        (``<symbol>-...``); None if the URL has no path component."""
        try:
            filename = self.context['response'].url.split('/')[-1]
            print("filename: " + filename)
            if filename == "a0330201910qdocument_htm.xml":
                # hard code for the report
                return "INTC"
            else:
                return filename.split('-')[0].upper()
        except IndexError:
            return None

    def _get_doc_fiscal_year(self):
        """Return dei:DocumentFiscalYearFocus as an int, or None if absent
        or non-numeric."""
        try:
            fiscal_year = self.selector.xpath(
                '//dei:DocumentFiscalYearFocus/text()')[0].extract()
            return int(fiscal_year)
        except (IndexError, ValueError):
            return None

    def _guess_fiscal_year(self, end_date, period_focus):
        # Guess fiscal_year based on document end_date and period_focus
        date = datetime.strptime(end_date, DATE_FORMAT)
        month_ranges = {
            'Q1': (2, 3, 4),
            'Q2': (5, 6, 7),
            'Q3': (8, 9, 10),
            'FY': (11, 12, 1)
        }
        month_range = month_ranges.get(period_focus)

        # Case 1: release Q1 around March, Q2 around June, ...
        # This is what most companies do
        if date.month in month_range:
            if period_focus == 'FY' and date.month == 1:
                return date.year - 1
            return date.year

        # How many days left before 10-K's release?
        days_left_table = {'Q1': 270, 'Q2': 180, 'Q3': 90, 'FY': 0}
        days_left = days_left_table.get(period_focus)

        # Other cases, assume end_date.year of its FY report equals to
        # its fiscal_year
        if days_left is not None:
            fy_date = date + timedelta(days=days_left)
            return fy_date.year

        return None

    def _get_doc_end_date(self):
        """Determine the report period end date as a DATE_FORMAT string.

        Candidates come from the URL filename and from
        dei:DocumentPeriodEndDate; the document date is cross-checked
        against the XBRL context endDate values before being trusted.
        """
        # the document end date could come from URL or document content
        # we need to guess which one is correct
        url_str = self.context['response'].url
        print(url_str)
        # .htm-style filenames carry no parseable YYYYMMDD suffix.
        if "htm" in url_str:
            URLDateValid = False
        else:
            URLDateValid = True
            url_date_str = url_str.split('-')[-1].split('.')[0]
            url_date = datetime.strptime(url_date_str, '%Y%m%d')
            url_date_str = url_date.strftime(DATE_FORMAT)
            print("Date from URL" + url_date_str)

        DocDateValid = True
        try:
            doc_date_str = self.selector.xpath(
                '//dei:DocumentPeriodEndDate/text()')[0].extract()
            print("Date from doc" + doc_date_str)
            doc_date = datetime.strptime(doc_date_str, DATE_FORMAT)
        except (IndexError, ValueError):
            DocDateValid = False
            #return url_date.strftime(DATE_FORMAT)

        context_date_strs = set(
            self.selector.xpath(
                '//*[local-name()="context"]//*[local-name()="endDate"]/text()'
            ).extract())

        # Only trust the document date if it also appears as a context
        # endDate somewhere in the filing.
        if DocDateValid:
            if doc_date_str in context_date_strs:
                date = doc_date
            else:
                DocDateValid = False

        if DocDateValid:
            date = doc_date
        elif URLDateValid:
            date = url_date
        else:
            # NOTE(review): `date` is never assigned on this path, so the
            # return below raises NameError right after this print (the
            # message also contains a typo, left untouched — runtime string).
            print("No peirod end date is found!!")
        return date.strftime(DATE_FORMAT)

    def _get_doc_type(self):
        """Return dei:DocumentType upper-cased ('10-Q', '10-K', ...), or None."""
        try:
            return self.selector.xpath(
                '//dei:DocumentType/text()')[0].extract().upper()
        except (IndexError, ValueError):
            return None

    def _get_period_focus(self, doc_end_date):
        """Infer the fiscal period ('FY'/'Q1'/'Q2'/'Q3') for `doc_end_date`.

        Prefers the explicit dei:DocumentFiscalPeriodFocus tag; otherwise
        buckets by the day distance between the document end date and the
        fiscal year end date.
        """
        try:
            return self.selector.xpath('//dei:DocumentFiscalPeriodFocus/text()'
                                       )[0].extract().strip().upper()
        except IndexError:
            pass

        try:
            doc_yr = doc_end_date.split('-')[0]
            yr_end_date = self.selector.xpath(
                '//dei:CurrentFiscalYearEndDate/text()')[0].extract()
            # CurrentFiscalYearEndDate is '--MM-DD'; substitute the doc year.
            yr_end_date = yr_end_date.replace('--', doc_yr + '-')
        except IndexError:
            return None

        doc_end_date = datetime.strptime(doc_end_date, '%Y-%m-%d')
        yr_end_date = datetime.strptime(yr_end_date, '%Y-%m-%d')
        delta_days = (yr_end_date - doc_end_date).days

        # Bucket by distance (in days) from the fiscal year end.
        if delta_days > -45 and delta_days < 45:
            return 'FY'
        elif (delta_days <= -45 and delta_days > -135) or delta_days > 225:
            return 'Q1'
        elif (delta_days <= -135 and delta_days > -225) or (delta_days > 135 and delta_days <= 225):
            return 'Q2'
        elif delta_days <= -225 or (delta_days > 45 and delta_days <= 135):
            return 'Q3'

        return 'FY'
class ChildItemLoader(TestItemLoader):
    """Child loader: applies the parent's name_in chain, then swapcases."""
    name_in = MapCompose(TestItemLoader.name_in, unicode.swapcase)