# Example #1
# 0
 def parse(self, response):
     """Parse one listing page of the TDK dictionary.

     Creates an ``Entry`` row for every keyword cell found in the page's
     results table, then yields a ``Request`` for the next listing page
     when one exists.
     """
     hxs = HtmlXPathSelector(response)
     rows = hxs.select("/html/body/div/div/div/table/table/tr")
     # First add all the keywords on this page.
     for tr in rows:
         for td in tr.select("td"):
             keyword = "".join(td.select("p//text()").extract())
             # Keep only the headword: drop alternates after "," and "/"
             # and any parenthesised qualifier.
             keyword = keyword.split(",")[0].split("/")[0].split("(")[0]
             keyword = keyword.strip()
             # NOTE: for the word "f" the TDK dictionary web page is
             # broken and leaks 'FF0000">' into the keyword, so
             # "fabrikacilik" becomes u'FF0000">fabrikac\u0131l\u0131k'.
             # Strip that artifact here instead of patching rows up
             # later with an UPDATE ... replace(...) on the database.
             keyword = keyword.replace('FF0000">', "")
             try:
                 Entry.objects.create(keyword=keyword, normalized=normalize(keyword))
             except IntegrityError:
                 # Duplicate keyword: skip it, but leave a trace on stdout.
                 print("Got IntegrityError on: %s" % keyword)
     # Next, add the next page to URLs to crawl.
     if rows:
         next_page_xpath = "/html/body/div/div/div/table/tr/td/form/p/span[2]/a/@href"
         paths = hxs.select(next_page_xpath).extract()
         # The last page has no "next" link; guard against an empty
         # extract() instead of letting [0] raise IndexError.
         if paths:
             yield Request(self.domain + paths[0], callback=self.parse)
# Example #2
# 0
def find_meaning(keyword):
    """
    Return a dict describing *keyword* and its meanings, or ``None``
    when the entry exists but has no meanings.

    Looks the entry up by exact keyword first, then falls back to the
    normalized form; raises ``Http404`` when neither lookup matches.
    """
    try:
        entry = Entry.objects.get(keyword=keyword)
    except Entry.DoesNotExist:
        # No exact match -- fall back to the normalized-form lookup and
        # take the first hit, if any.
        try:
            entry = Entry.objects.filter(normalized=normalize(keyword))[0]
        except IndexError:
            raise Http404

    # Custom SQL using PostgreSQL's array_agg and row_to_json to
    # aggregate all of the entry's meanings into one JSON array in a
    # single round trip.
    cursor = connection.cursor()
    try:
        cursor.execute(
            "SELECT array_to_json(array_agg(row_to_json(t1))) "
            "FROM ("
            "SELECT m.id, m.tags, content, example FROM entry_meaning as m "
            "WHERE m.entry_id=%s ORDER BY id ASC) t1",
            [entry.id],
        )
        result = cursor.fetchone()[0]
    finally:
        # Raw cursors are not managed by the ORM; close explicitly so a
        # failed execute() does not leak the cursor.
        cursor.close()

    # ``result`` looks like:
    #   [{u'content': u'aslen cinceden gecmis tum dunya dillerine',
    #     u'id': 1, u'tags': [u'isim', u'cince']},
    #    {u'content': u'tropik firtina', u'id': 2,
    #     u'tags': [u'isim', u'ingilizce']}]
    if result:
        entry_dict = model_to_dict(entry, ["keyword", "extra_info", "tags"])
        entry_dict["meaning"] = result
        return entry_dict
    else:
        return None
    def parse(self, response):
        """Parse one detail page and store its meanings.

        Each ``hor-minimalist-a`` table on the page holds one block of
        meanings: the ``thead`` carries the keyword plus entry-level
        tags/extra info, and each body ``tr`` is one meaning (optional
        tags in the first ``<i>``, an optional example in the second,
        and an optional source in ``<b>``).  Updates the matching
        ``Entry`` in place and creates one ``Meaning`` row per meaning.
        """
        hxs = HtmlXPathSelector(response)
        tr_list = hxs.select("/html/body/div/div/div/table/tr")[3].select(
            "td/table[@id='hor-minimalist-a']")
        entry = None
        for meaning_tr in tr_list:
            # Keyword is not needed here because we already have correct
            # keywords crawled from official website.
            keyword = "".join(meaning_tr.select(
                "thead/tr/th/b//text()").extract()).split("(")[0].strip()

            # Resolve the entry once; later tables on the same page reuse it.
            if not entry:
                try:
                    entry = Entry.objects.get(keyword=keyword)
                except Entry.DoesNotExist:
                    # Fall back to the normalized form and adopt this
                    # page's spelling as the canonical keyword.
                    entry = Entry.objects.get(normalized=normalize(keyword))
                    entry.keyword = keyword

            # Entry-level tags live in <i><b> inside the table head.
            tags = "".join(meaning_tr.select(
                "thead/tr/th/i/b//text()").extract()).strip()
            if tags:
                tags = [tag.strip() for tag in tags.split(",")]

            # Extra info (origin language etc.) is the plain <i> text.
            extra_info = "".join(meaning_tr.select(
                "thead/tr/th/i/text()").extract()).strip()
            if extra_info:
                extra_info = [extra.strip() for extra in extra_info.split(",")]

            # Merge new tags into the entry, de-duplicating the union.
            if tags:
                if not entry.tags:
                    entry.tags = tags
                else:
                    entry.tags.extend(tags)
                    entry.tags = list(set(entry.tags))

            # Same merge-and-dedupe for extra_info.
            if extra_info:
                if not entry.extra_info:
                    entry.extra_info = extra_info
                else:
                    entry.extra_info.extend(extra_info)
                    entry.extra_info = list(set(entry.extra_info))
            # NOTE(review): other code in this file normalizes with
            # normalize(keyword) (no .lower()) -- confirm normalize()
            # already lowercases, otherwise these forms can diverge.
            if not entry.normalized:
                entry.normalized = normalize(keyword.lower())
            entry.save()
            # Each body row of the table is one meaning.
            for meaning in meaning_tr.select("tr"):
                # Per-meaning tags: first <i> of the cell, comma-separated.
                tags = "".join(meaning.select("td")[0].select("i")[0].select(
                        "text()").extract()).strip()
                if tags:
                    tags = [tag.strip() for tag in tags.split(",")]
                else:
                    tags = None
                # The meaning text is the cell's own text, with the
                # leading/trailing boilerplate stripped by the two regexes.
                meaning_text = "".join(meaning.select("td")[0].select(
                    "text()").extract())
                meaning_text = self.meaning_start_re.sub("", meaning_text)
                meaning_text = self.meaning_end_re.sub("", meaning_text)
                # An example sentence, when present, is the second <i>.
                try:
                    example = "".join(meaning.select("td")[0].select(
                        "i")[1].select("text()").extract())
                except IndexError:
                    example = None
                if example:
                    # Append the quoted author/source (in <b>) if given.
                    source = "".join(meaning.select("td")[0].select(
                        "b/text()").extract()).strip()
                    if source:
                        example = example + " - " + source
                Meaning.objects.create(entry=entry, tags=tags,
                                      content=meaning_text, example=example)