def parse(self, response):
    """Store every keyword listed on an index page, then queue the next page.

    Each table cell of the listing is reduced to a bare keyword: only the
    text before the first ",", "/" or "(" is kept and surrounding
    whitespace is stripped. The keyword is saved as an ``Entry`` together
    with its normalized form. Keywords that violate a database constraint
    (``IntegrityError``) are reported on stdout and skipped.

    Args:
        response: the downloaded index page.

    Yields:
        A ``Request`` for the next index page, handled by this same
        callback, when the page listed at least one row and a next-page
        link is present.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("/html/body/div/div/div/table/table/tr")

    # First add all the keywords on this page.
    for tr in rows:
        for td in tr.select("td"):
            keyword = "".join(td.select("p//text()").extract())
            keyword = keyword.split(",")[0].split("/")[0].split("(")[0]
            # For words starting with "f" the TDK dictionary page leaks a
            # fragment of broken markup into the text, so "fabrikacilik"
            # arrives as u'FF0000">fabrikac\u0131l\u0131k'. Strip it here.
            # Rows stored before this cleanup existed can be repaired with:
            #   UPDATE entry_entry
            #   SET keyword=replace(keyword, 'FF0000">', ''),
            #       normalized=replace(normalized, 'FF0000">', '')
            #   WHERE keyword LIKE 'FF%';
            keyword = keyword.replace('FF0000">', "").strip()
            if not keyword:
                # Empty cell: there is no keyword to store.
                continue
            try:
                Entry.objects.create(keyword=keyword,
                                     normalized=normalize(keyword))
            except IntegrityError:
                # Pass on when we get an IntegrityError.
                print("Got IntegrityError on: %s" % keyword)

    # Next, add the next page to the URLs to crawl. A page without rows
    # marks the end of the listing, so nothing is queued for it.
    if not rows:
        return
    next_page_xpath = "/html/body/div/div/div/table/tr/td/form/p/span[2]/a/@href"
    next_paths = hxs.select(next_page_xpath).extract()
    # The link can be missing (e.g. on the final page); indexing blindly
    # would raise IndexError instead of simply ending the crawl.
    if next_paths:
        yield Request(self.domain + next_paths[0], callback=self.parse)
def find_meaning(keyword):
    """Look up *keyword* and return its entry together with its meanings.

    The entry is matched on the exact keyword first; if there is no such
    entry, the first entry whose ``normalized`` field equals
    ``normalize(keyword)`` is used instead.

    Args:
        keyword: the word to look up.

    Returns:
        A dict holding the entry's "keyword", "extra_info" and "tags"
        fields plus a "meaning" key with the aggregated meaning rows as
        returned by the database, or ``None`` when the query yields no
        meanings for the entry.

    Raises:
        Http404: if no entry matches the keyword or its normalized form.
    """
    try:
        entry = Entry.objects.get(keyword=keyword)
    except Entry.DoesNotExist:
        try:
            entry = Entry.objects.filter(normalized=normalize(keyword))[0]
        except IndexError:
            raise Http404

    # Custom SQL using some advanced functions such as array_agg and
    # row_to_json so the database itself builds the JSON data.
    cursor = connection.cursor()
    try:
        cursor.execute(
            "SELECT array_to_json(array_agg(row_to_json(t1))) "
            "FROM ("
            "SELECT m.id, m.tags, content, example FROM entry_meaning as m "
            "WHERE m.entry_id=%s ORDER BY id ASC) t1",
            [entry.id],
        )
        result = cursor.fetchone()[0]
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    if not result:
        return None

    # The returned dict looks like:
    # {u'meaning': [{u'content': u'aslen cinceden gecmis tum dunya dillerine',
    #                u'id': 1,
    #                u'tags': [u'isim', u'cince']},
    #               {u'content': u'tropik firtina',
    #                u'id': 2,
    #                u'tags': [u'isim', u'ingilizce']}],
    #  u'keyword': u'tayfun'}
    entry_dict = model_to_dict(entry, ["keyword", "extra_info", "tags"])
    entry_dict["meaning"] = result
    return entry_dict
def parse(self, response):
    """Parse a meaning page and store the meanings of its keyword.

    The page is expected to carry one or more tables with the id
    ``hor-minimalist-a`` inside the fourth row of the main layout table.
    The heading of the first table identifies the ``Entry`` (by exact
    keyword, else by normalized form); that entry is then reused for every
    table on the page. Each table's heading contributes tags and extra
    info to the entry, and each body row becomes one ``Meaning``.

    Pages without the expected layout are reported on stdout and skipped.
    Body rows without a ``td`` are skipped, and a row without an ``<i>``
    element is stored with ``tags=None``, so a single irregular row no
    longer aborts the rest of the page.

    Args:
        response: the downloaded meaning page.

    Raises:
        Entry.DoesNotExist: if neither the keyword nor its normalized form
            matches a stored entry.
    """

    def merged(existing, new):
        """Return *new* merged into *existing* with duplicates removed."""
        if not existing:
            return new
        return list(set(existing) | set(new))

    hxs = HtmlXPathSelector(response)
    layout_rows = hxs.select("/html/body/div/div/div/table/tr")
    if len(layout_rows) < 4:
        # The meaning tables live in the fourth row; without it there is
        # nothing to parse on this page.
        print("No meaning tables found on: %s" % response.url)
        return
    tr_list = layout_rows[3].select("td/table[@id='hor-minimalist-a']")

    entry = None
    for meaning_tr in tr_list:
        # The heading keyword is only used to find the entry. When the
        # match falls back to the normalized form, the entry's keyword is
        # overwritten with the spelling found on this page.
        keyword = "".join(
            meaning_tr.select("thead/tr/th/b//text()").extract())
        keyword = keyword.split("(")[0].strip()
        if entry is None:
            try:
                entry = Entry.objects.get(keyword=keyword)
            except Entry.DoesNotExist:
                entry = Entry.objects.get(normalized=normalize(keyword))
                entry.keyword = keyword

        # Entry-level tags and extra info are comma separated lists in the
        # table heading.
        tags = "".join(
            meaning_tr.select("thead/tr/th/i/b//text()").extract()).strip()
        if tags:
            tags = [tag.strip() for tag in tags.split(",")]
        extra_info = "".join(
            meaning_tr.select("thead/tr/th/i/text()").extract()).strip()
        if extra_info:
            extra_info = [extra.strip() for extra in extra_info.split(",")]

        if tags:
            entry.tags = merged(entry.tags, tags)
        if extra_info:
            entry.extra_info = merged(entry.extra_info, extra_info)
        if not entry.normalized:
            entry.normalized = normalize(keyword.lower())
        entry.save()

        for meaning in meaning_tr.select("tr"):
            cells = meaning.select("td")
            if not cells:
                # Row without a data cell: there is no meaning to read.
                continue
            cell = cells[0]
            italics = cell.select("i")

            # The first <i> holds the meaning's comma separated tags.
            meaning_tags = None
            if italics:
                tag_text = "".join(
                    italics[0].select("text()").extract()).strip()
                if tag_text:
                    meaning_tags = [tag.strip()
                                    for tag in tag_text.split(",")]

            # NOTE(review): meaning_start_re / meaning_end_re presumably
            # strip decoration around the meaning text -- confirm against
            # their definitions on the spider class.
            meaning_text = "".join(cell.select("text()").extract())
            meaning_text = self.meaning_start_re.sub("", meaning_text)
            meaning_text = self.meaning_end_re.sub("", meaning_text)

            # The second <i>, when present, holds an example sentence; its
            # source, if any, is the cell's <b> text.
            example = None
            if len(italics) > 1:
                example = "".join(italics[1].select("text()").extract())
            if example:
                source = "".join(cell.select("b/text()").extract()).strip()
                if source:
                    example = example + " - " + source

            Meaning.objects.create(entry=entry, tags=meaning_tags,
                                   content=meaning_text, example=example)