def parse(self, response):
    """Load company-header and market-data fields into a ``FinanceItem``.

    Each field is read with a single XPath expression.  Every extracted
    string goes through ``MapCompose(unicode.strip, unicode.title)``, i.e.
    it is stripped and then title-cased.

    :param response: response queried by the XPath expressions.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    field_xpaths = (
        ("CompanyName", '//*[@id="companyheader"]/div[1]/h3/text()'),
        ("StockExchangeAndCode", '//*[@id="companyheader"]/div[1]/text()[1]'),
        # NOTE(review): these two ids embed the number 6826782, so they
        # presumably match only one specific listing -- confirm.
        ("Currency", '//*[@id="ref_6826782_elt"]/div/div/text()'),
        ("StockPrice", '//*[@id="ref_6826782_l"]/text()'),
        ("MarketCap", '//*[@id="market-data-div"]/div[2]/div[1]/table[1]/tr[5]/td[2]/text()'),
        ("PE", '//*[@id="market-data-div"]/div[2]/div[1]/table[1]/tr[6]/td[2]/text()'),
        ("EPS", '//*[@id="market-data-div"]/div[2]/div[1]/table[2]/tr[2]/td[2]/text()'),
        ("Shares", '//*[@id="market-data-div"]/div[2]/div[1]/table[2]/tr[3]/td[2]/text()'),
        ("InstOwn", '//*[@id="market-data-div"]/div[2]/div[1]/table[2]/tr[5]/td[2]/text()'),
    )

    loader = ItemLoader(item=FinanceItem(), response=response)
    for field_name, xpath in field_xpaths:
        # A fresh processor is built for every field.
        loader.add_xpath(field_name, xpath,
                         MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse(self, response):
    """Build a ``FinanceItem`` whose ``description`` comes from page text.

    The first string matched by the XPath below is split on ``'.'``; its
    first two pieces are concatenated (the separating period is not kept)
    and stored as ``description``.  The same value is also remembered on
    ``self.risultato``.

    :param response: response wrapped in an ``HtmlXPathSelector``.
    :returns: the populated item.
    :raises IndexError: if the XPath matches nothing, or the matched text
        contains no ``'.'``.
    """
    selector = HtmlXPathSelector(response)
    item = FinanceItem()
    matches = selector.select(
        "//table[@class='yfnc_modtitle1'][1]/following::p/text()"
    ).extract()

    # Split once and reuse both pieces; only the first one goes through str().
    pieces = matches[0].split('.')
    description = str(pieces[0]) + pieces[1]

    item['description'] = description
    self.risultato = description
    return item
def parse_item(self, response):
    """Yield one ``FinanceItem`` describing the article in ``response``.

    ``content`` is a list of ``["p", text]`` pairs, one per paragraph text
    node under the ``article article_16`` div.  ``source`` and ``type`` are
    both ``'sina'`` and ``href`` is the response URL.

    :param response: response queried by the XPath expressions.
    :raises IndexError: if the page has no ``<title>`` text.
    """
    site = 'sina'
    item = FinanceItem()

    paragraphs = response.xpath(
        '//div[@class="article article_16"]/p/text()').extract()
    item['content'] = [["p", paragraph] for paragraph in paragraphs]
    item['source'] = site

    # The whole list of matches is stored (no [0] indexing), so a page
    # without a time-source span gives [] here instead of raising.
    item['datetime'] = response.xpath(
        '//div[@class="page-info"]/span[@class="time-source"]/text()'
    ).extract()

    titles = response.xpath("/html/head/title/text()").extract()
    item['title'] = titles[0]
    item['href'] = response.url
    item['type'] = site
    yield item
def parse_item(self, response):
    """Yield one ``FinanceItem`` describing the article in ``response``.

    ``content`` is a list of ``["p", text]`` pairs, one per paragraph text
    node under the ``Body`` div.  ``datetime`` and ``title`` are the first
    match of their XPath; ``source`` and ``type`` are both ``'eastmoney'``
    and ``href`` is the response URL.

    :param response: response queried by the XPath expressions.
    :raises IndexError: if the time or headline XPath matches nothing.
    """
    site = 'eastmoney'
    item = FinanceItem()

    paragraphs = response.xpath('//div[@class="Body"]/p/text()').extract()
    item['content'] = [["p", paragraph] for paragraph in paragraphs]
    item['source'] = site

    timestamps = response.xpath('//div[@class="time"]/text()').extract()
    item['datetime'] = timestamps[0]

    headlines = response.xpath(
        '//div[@class="newsContent"]/h1/text()').extract()
    item['title'] = headlines[0]

    item['href'] = response.url
    item['type'] = site
    yield item
def parse_item(self, response):
    """Return one ``FinanceItem`` describing the article in ``response``.

    ``content`` is a list of ``["p", text]`` pairs, one per paragraph text
    node under the ``Cnt-Main-Article-QQ`` div.  ``datetime`` and ``title``
    are the first match of their XPath; ``source`` and ``type`` are both
    ``'qq'`` and ``href`` is the response URL.

    :param response: response queried by the XPath expressions.
    :returns: the populated item (returned directly, not yielded).
    :raises IndexError: if the time or title XPath matches nothing.
    """
    site = 'qq'
    item = FinanceItem()

    paragraphs = response.xpath(
        '//div[@class="Cnt-Main-Article-QQ"]/p/text()').extract()
    item['content'] = [["p", paragraph] for paragraph in paragraphs]
    item['source'] = site

    timestamps = response.xpath(
        '//div[@class="a_Info"]/span[@class="a_time"]/text()').extract()
    item['datetime'] = timestamps[0]

    titles = response.xpath("/html/head/title/text()").extract()
    item['title'] = titles[0]

    item['href'] = response.url
    item['type'] = site
    return item
def parse_item(self, response):
    """Yield one ``FinanceItem`` describing the article in ``response``.

    ``content`` is a list of ``["p", text]`` pairs, one per paragraph text
    node under the ``content`` div.  ``datetime`` and ``title`` are the
    first match of their XPath; ``source`` and ``type`` are both
    ``'cnstock'`` and ``href`` is the response URL.

    :param response: response queried by the XPath expressions.
    :raises IndexError: if the timer or title XPath matches nothing.
    """
    site = 'cnstock'
    item = FinanceItem()

    paragraphs = response.xpath('//div[@class="content"]/p/text()').extract()
    item['content'] = [["p", paragraph] for paragraph in paragraphs]
    item['source'] = site

    timestamps = response.xpath(
        '//div[@class="bullet"]/span[@class="timer"]/text()').extract()
    item['datetime'] = timestamps[0]

    titles = response.xpath("/html/head/title/text()").extract()
    item['title'] = titles[0]

    item['href'] = response.url
    item['type'] = site
    yield item
def parse(self, response):
    """Load company-header fields and key ratios into a ``FinanceItem``.

    The two header fields come from the ``companyheader`` element.  The
    seven ratio fields are read from the third cell of rows 1-7 of one
    table under ``gf-viewc``.  Every extracted string is stripped and then
    title-cased by ``MapCompose(unicode.strip, unicode.title)``.

    :param response: response queried by the XPath expressions.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    header_xpaths = (
        ("CompanyName", '//*[@id="companyheader"]/div[1]/h3/text()'),
        ("StockExchangeAndCode", '//*[@id="companyheader"]/div[1]/text()[1]'),
    )

    ratios_table = '//*[@id="gf-viewc"]/div/div/div[3]/div[1]/div/div[4]/table'
    # (field, row number) in table order.
    ratio_rows = (
        ("NetProfitMargin", 1),
        ("OperatingMargin", 2),
        ("EBITDMargin", 3),
        ("ReturnOnAssets", 4),
        ("ReturnOnEquity", 5),
        # Previously read row 5 again, duplicating ReturnOnEquity.  Row 6 is
        # the only row between 1 and 7 that was never read, so it is assumed
        # to hold the employee count -- TODO confirm against a live page.
        ("Employees", 6),
        ("CDPScore", 7),
    )

    loader = ItemLoader(item=FinanceItem(), response=response)
    for field_name, xpath in header_xpaths:
        loader.add_xpath(field_name, xpath,
                         MapCompose(unicode.strip, unicode.title))
    for field_name, row_number in ratio_rows:
        # Always select the cell's text node.  Without the trailing /text()
        # the whole <td> element is extracted and its markup gets title-cased
        # (this was the case for EBITDMargin and CDPScore).
        loader.add_xpath(
            field_name,
            '%s/tr[%d]/td[3]/text()' % (ratios_table, row_number),
            MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse_item(self, response):
    """Yield one ``FinanceItem`` describing the article in ``response``.

    The response URL is printed to stdout first.  ``content`` is a list of
    ``["p", text]`` pairs, one per text node of any ``<p>`` in the page.
    ``datetime`` is the first 19 characters of the first date span,
    ``title`` is the page title, ``source`` and ``type`` are both
    ``'10jqka'`` and ``href`` is the response URL.

    :param response: response queried by the XPath expressions.
    :raises IndexError: if the date or title XPath matches nothing.
    """
    # Written as a single-argument call so the line is valid, and prints the
    # same "<url> ---------------------" text, on both Python 2 and Python 3
    # (the former print statement was a SyntaxError on Python 3).
    print('%s %s' % (response.url, '---------------------'))

    site = '10jqka'
    item = FinanceItem()

    paragraphs = response.xpath('//p/text()').extract()
    item['content'] = [["p", paragraph] for paragraph in paragraphs]
    item['source'] = site

    dates = response.xpath('//div[@class="date"]/span/text()').extract()
    # Keep only the leading 19 characters of the matched text.
    item['datetime'] = dates[0][:19]

    titles = response.xpath("/html/head/title/text()").extract()
    item['title'] = titles[0]

    item['href'] = response.url
    item['type'] = site
    yield item
def parse(self, response):
    """Load the ``incinterimdiv`` statement table into a ``FinanceItem``.

    All values come from the ``fs-table`` element inside the
    ``incinterimdiv`` container: header cells 1 and 2 give ``Currency`` and
    ``TimePeriod``, and the second cell of body rows 1-49 gives one field
    each.  Every extracted string is stripped and then title-cased by
    ``MapCompose(unicode.strip, unicode.title)``.

    :param response: response queried by the XPath expressions.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    table = '//div[@id="incinterimdiv"]//*[@id="fs-table"]'
    direct = 'text()'       # text node directly inside the cell
    nested = 'span/text()'  # text node inside a <span> within the cell

    # One entry per body row, in row order (first entry is tr[1]).
    # NOTE(review): which rows use the <span> form is hard-coded; presumably
    # it depends on how the page renders particular values -- confirm.
    body_rows = (
        ("Revenue", direct),
        ("OtherRevenueTotal", direct),
        ("TotalRevenue", direct),
        ("CostOfRevenueTotal", direct),
        ("GrossProfit", direct),
        ("SellingGeneralAdminExpensesTotal", direct),
        ("ResearchAndDevelopment", direct),
        ("DepreciationAmortization", direct),
        ("InterestExpenseIncome", direct),
        ("UnusualExpenseIncome", direct),
        ("OtherOperatingExpenses", direct),
        ("TotalOperatingExpenses", direct),
        ("OperatingIncome", nested),
        ("InterestIncomeExpense", direct),
        ("GainLossOnSaleOfAssets", direct),
        ("OtherNet", nested),
        ("IncomeBeforeTax", nested),
        ("IncomeAfterTax", nested),
        ("MinorityInterest", direct),
        ("EquityInAffiliates", direct),
        ("NetIncomeBeforeExtraItems", nested),
        ("AccountingChange", direct),
        ("DiscontinuedOperations", direct),
        ("ExtraordinaryItem", direct),
        ("NetIncome", nested),
        ("PreferredDividends", direct),
        ("IncomeAvailabletoCommonExclExtraItems", nested),
        ("IncomeAvailabletoCommonInclExtraItems", nested),
        ("BasicWeightedAverageShares", direct),
        ("BasicEPSExcludingExtraordinaryItems", direct),
        ("BasicEPSIncludingExtraordinaryItems", direct),
        ("DilutionAdjustment", direct),
        ("DilutedWeightedAverageShares", direct),
        ("DilutedEPSExcludingExtraordinaryItems", nested),
        ("DilutedEPSIncludingExtraordinaryItems", direct),
        ("DividendsperShareCommonStockPrimaryIssue", direct),
        ("GrossDividendsCommonStock", direct),
        ("NetIncomeAfterStockBasedCompExpense", direct),
        ("BasicEPSAfterStockBasedCompExpense", direct),
        ("DilutedEPSAfterStockBasedCompExpense", direct),
        ("DepreciationSupplemental", direct),
        ("TotalSpecialItems", direct),
        ("NormalizedIncomeBeforeTaxes", direct),
        ("EffectOfSpecialItemsOnIncomeTaxes", direct),
        ("IncomeTaxesExcludingImpactOfSpecialItems", direct),
        ("NormalizedIncomeAfterTaxes", direct),
        ("NormalizedIncomeAvailToCommon", direct),
        ("BasicNormalizedEPS", direct),
        ("DilutedNormalizedEPS", nested),
    )

    loader = ItemLoader(item=FinanceItem(), response=response)
    for column, field_name in enumerate(("Currency", "TimePeriod"), 1):
        loader.add_xpath(
            field_name,
            '%s/thead/tr/th[%d]/text()' % (table, column),
            MapCompose(unicode.strip, unicode.title))
    for row_number, (field_name, leaf) in enumerate(body_rows, 1):
        loader.add_xpath(
            field_name,
            '%s/tbody/tr[%d]/td[2]/%s' % (table, row_number, leaf),
            MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse(self, response):
    """Load the ``balannualdiv`` statement table into a ``FinanceItem``.

    All values come from the ``fs-table`` element inside the
    ``balannualdiv`` container: header cells 1 and 2 give ``Currency`` and
    ``TimePeriod``, and the second cell of body rows 1-42 gives one field
    each.  Every extracted string is stripped and then title-cased by
    ``MapCompose(unicode.strip, unicode.title)``.

    :param response: response queried by the XPath expressions.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    table = '//div[@id="balannualdiv"]//*[@id="fs-table"]'
    direct = 'text()'       # text node directly inside the cell
    nested = 'span/text()'  # text node inside a <span> within the cell

    # One entry per body row, in row order (first entry is tr[1]).
    # NOTE(review): which rows use the <span> form is hard-coded; presumably
    # it depends on how the page renders particular values -- confirm.
    body_rows = (
        ("CashAndEquivalents", direct),
        ("ShortTermInvestments", direct),
        ("CashAndShortTermInvestments", direct),
        ("AccountsReceivableTradeNet", direct),
        ("ReceivablesOther", direct),
        ("TotalReceivablesNet", direct),
        ("TotalInventory", direct),
        ("PrepaidExpenses", direct),
        ("OtherCurrentAssetsTotal", direct),
        ("TotalCurrentAssets", direct),
        ("PropertyPlantEquipmentTotalGross", direct),
        ("AccumulatedDepreciationTotal", nested),
        ("GoodwillNet", direct),
        ("IntangiblesNet", direct),
        ("LongTermInvestments", direct),
        ("OtherLongTermAssetsTotal", direct),
        ("TotalAssets", direct),
        ("AccountsPayable", direct),
        ("AccruedExpenses", direct),
        ("NotesPayableShortTermDebt", direct),
        ("CurrentPortofLTDebtCapitalLeases", direct),
        ("OtherCurrentliabilitiesTotal", direct),
        ("TotalCurrentLiabilities", direct),
        ("LongTermDebt", direct),
        ("CapitalLeaseObligations", direct),
        ("TotalLongTermDebt", direct),
        ("TotalDebt", direct),
        ("DeferredIncomeTax", direct),
        ("MinorityInterest", direct),
        ("OtherLiabilitiesTotal", direct),
        ("TotalLiabilities", direct),
        ("RedeemablePreferredStockTotal", direct),
        ("PreferredStockNonRedeemableNet", direct),
        ("CommonStockTotal", direct),
        ("AdditionalPaidInCapital", direct),
        ("RetainedEarningsAccumulatedDeficit", direct),
        ("TreasuryStockCommon", nested),
        ("OtherEquityTotal", nested),
        ("TotalEquity", direct),
        ("TotalLiabilitiesShareholdersEquity", direct),
        ("SharesOutsCommonStockPrimaryIssue", direct),
        ("TotalCommonSharesOutstanding", direct),
    )

    loader = ItemLoader(item=FinanceItem(), response=response)
    for column, field_name in enumerate(("Currency", "TimePeriod"), 1):
        loader.add_xpath(
            field_name,
            '%s/thead/tr/th[%d]/text()' % (table, column),
            MapCompose(unicode.strip, unicode.title))
    for row_number, (field_name, leaf) in enumerate(body_rows, 1):
        loader.add_xpath(
            field_name,
            '%s/tbody/tr[%d]/td[2]/%s' % (table, row_number, leaf),
            MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse(self, response):
    """Load the ``casinterimdiv`` statement table into a ``FinanceItem``.

    All values come from the ``fs-table`` element inside the
    ``casinterimdiv`` container: header cells 1 and 2 give ``Currency`` and
    ``TimePeriod``, and the second cell of body rows 1-19 gives one field
    each.  Every extracted string is stripped and then title-cased by
    ``MapCompose(unicode.strip, unicode.title)``.

    :param response: response queried by the XPath expressions.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    table = '//div[@id="casinterimdiv"]//*[@id="fs-table"]'
    direct = 'text()'       # text node directly inside the cell
    nested = 'span/text()'  # text node inside a <span> within the cell

    # One entry per body row, in row order (first entry is tr[1]).
    # NOTE(review): which rows use the <span> form is hard-coded; presumably
    # it depends on how the page renders particular values -- confirm.
    body_rows = (
        ("NetIncomeStartingLine", nested),
        ("DepreciationDepreciation", direct),
        ("Amortization", direct),
        ("DeferredTaxes", nested),
        ("NonCashItems", direct),
        ("ChangesInWorkingCapital", direct),
        ("CashFromOperatingActivities", direct),
        ("CapitalExpenditures", nested),
        ("OtherInvestingCashFlowItemsTotal", nested),
        ("CashFromInvestingActivities", nested),
        ("FinancingCashFlowItems", nested),
        ("TotalCashDividendsPaid", direct),
        ("IssuanceRetirementofStockNet", direct),
        ("IssuanceRetitementOfDebtNet", direct),
        ("CashFromFinancingActivities", nested),
        ("ForeignExchangeEffects", direct),
        ("NetChangeInCash", direct),
        ("CashInterestPaidSupplemental", direct),
        ("CashTaxesPaidSupplemental", direct),
    )

    loader = ItemLoader(item=FinanceItem(), response=response)
    for column, field_name in enumerate(("Currency", "TimePeriod"), 1):
        loader.add_xpath(
            field_name,
            '%s/thead/tr/th[%d]/text()' % (table, column),
            MapCompose(unicode.strip, unicode.title))
    for row_number, (field_name, leaf) in enumerate(body_rows, 1):
        loader.add_xpath(
            field_name,
            '%s/tbody/tr[%d]/td[2]/%s' % (table, row_number, leaf),
            MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse(self, response):
    """Yield one ``FinanceItem`` per data row of the PDF in ``response``.

    The PDF bytes in ``response.body`` are read twice: PyPDF2 supplies the
    page text from which the "(as of m/d/y)" date is taken, and tabula
    supplies the tables whose rows become items.

    Each table row is flattened into a list of tokens: the first cell is
    kept whole as the fund name, every later cell is split on whitespace
    and stripped of ``$`` and ``%``.  Tokens 1 and 2 are stored as the ex-
    and pay-dates; the 4th-, 3rd- and 1st-from-last tokens are parsed as
    short-term gain, long-term gain and per-share total.

    :param response: response whose ``body`` holds the PDF and whose ``url``
        is recorded on every item.
    :returns: generator of ``FinanceItem`` objects.
    :raises IndexError: if a row has too few tokens.
    :raises ValueError: if an amount token is not a valid float.
    """
    # --- "(as of ...)" date: first page that carries one -----------------
    pdf = PyPDF2.PdfFileReader(io.BytesIO(response.body))
    date_created = None
    for page_number in range(pdf.getNumPages()):
        page_text = pdf.getPage(page_number).extractText()
        match = re.search(r'\(as of (\d+/\d+/\d+)[ ]*\)', page_text)
        if match:
            date_created = match.group(1)
            break  # no need to extract text from the remaining pages

    # --- flatten every usable table row into a list of tokens ------------
    tables = tabula.read_pdf(io.BytesIO(response.body), pages="all",
                             multiple_tables=True, stream=True)
    rows = []
    for table in tables:
        if len(table) < 1:
            continue
        for ix in table.index:
            tokens = []
            for position, cell in enumerate(table.loc[ix]):
                if position == 0:
                    # Rows with no first cell, and header rows, carry no data.
                    if pd.isna(cell) or cell == 'Putnam fund name':
                        break
                    tokens.append(cell)
                elif pd.isna(cell):
                    # Empty cells have nothing to split; calling .split() on
                    # them used to raise AttributeError.
                    # NOTE(review): assumes an empty cell can be dropped
                    # without shifting the positions read below -- confirm.
                    continue
                else:
                    # Cells parsed as numbers have no .split(); stringify.
                    text = cell if hasattr(cell, 'split') else str(cell)
                    tokens.extend(piece.strip('$%') for piece in text.split())
            if tokens:
                rows.append(tokens)

    # --- one item per flattened row ---------------------------------------
    for row in rows:
        fund_name = row[0]
        if fund_name.startswith(':'):
            fund_name = fund_name[1:]

        per_share_total = float(row[-1])
        short_term_gain = float(row[-4])
        long_term_gain = float(row[-3])

        # A zero total would raise ZeroDivisionError; both percentages fall
        # back to 0 in that case (the long-term one used a bare ``except``).
        if per_share_total:
            short_term_gain_pct = short_term_gain / per_share_total
            long_term_gain_pct = long_term_gain / per_share_total
        else:
            short_term_gain_pct = long_term_gain_pct = 0

        item = FinanceItem()
        item['firm_name'] = 'Putnam'
        item['fund_name'] = fund_name
        item['ex_date'] = row[1]
        item['pay_date'] = row[2]
        item['short_term_gain'] = short_term_gain
        item['short_term_gain_pct'] = short_term_gain_pct
        item['long_term_gain'] = long_term_gain
        item['long_term_gain_pct'] = long_term_gain_pct
        item['record_date'] = date_created
        item['source_url'] = response.url
        yield item