Beispiel #1
0
 def run(self, doPrint=False):
     """Convert the parsed HTML table into rows and write them out.

     The header is written first.  Every ``<tr>`` of the parsed document is
     then visited; rows without ``<td>`` cells are skipped.  For the other
     rows the first cell is ignored and the remaining three are read as
     terms, see-also references and definition.  One output row is written
     for each term text and for each permutation that
     ``utils.getWordPermutations`` returns for it.

     Args:
         doPrint: Kept for interface compatibility; not used by this method.

     Raises:
         ValueError: If the cells after the first one cannot be unpacked
             into exactly three values.
         Exception: Errors from the stemming, metaphone and writer helpers
             propagate to the caller instead of dropping into a debugger,
             so a failed computation can never be written out with
             missing or stale values.
     """
     self.converter.writer.writeheader()
     rowTags = self.getParsedHTML(convertEntities=False).findAll("tr")
     for tr in rowTags:
         cells = tr.findAll("td")
         if not cells:
             continue
         # skip the first cell, which is page numbers from Pokorny's PIE
         # dictionary
         terms, seeAlsos, definition = cells[1:]
         pieTerms = []
         for term in terms.findAll("span"):
             pieTerms.append(term.text)
             pieTerms.extend(utils.getWordPermutations(term.text))
         # we're going to add these to the keywords too
         pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
         # clean up definitions text
         self.stripTags(definition)
         if not pieTerms:
             # no terms means no rows; skip the per-row work below
             continue
         # These values depend only on the table row, not on the individual
         # term, so compute them once per row rather than once per term.
         # NOTE(review): assumes fixUp/getStems/getMetaphones return the
         # same result for the same input -- confirm.
         eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
         engStems = self.getStems(eng)
         engPhones = utils.getMetaphones(engStems)
         seeAlsoStems = []
         for seeAlso in pieSeeAlsos:
             seeAlsoStems.extend(self.getStems(seeAlso, withUnicode=True))
         # put it all together
         for pie in pieTerms:
             pieStems = self.getStems(pie, withUnicode=True)
             pieStems.extend(seeAlsoStems)
             piePhones = utils.getMetaphones(
                 set(pieStems + pieTerms + pieSeeAlsos))
             row = {
                 self.converter.fields[0]: pie,
                 self.converter.fields[1]: eng,
                 self.converter.fields[2]: json.dumps(pieSeeAlsos),
                 self.converter.fields[3]: json.dumps(pieStems),
                 self.converter.fields[4]: json.dumps(engStems),
                 self.converter.fields[5]: json.dumps(piePhones),
                 self.converter.fields[6]: json.dumps(engPhones),
             }
             self.converter.writer.writerow(row)
Beispiel #2
0
 def run(self, doPrint=False):
     """Write the header, then one output row per PIE term in the table.

     Each ``<tr>`` of the parsed HTML that contains ``<td>`` cells is
     processed: the first cell is dropped, and the next three are taken as
     the terms, the see-also references and the definition.  Every term
     text, plus every permutation ``utils.getWordPermutations`` yields for
     it, produces one row via ``self.converter.writer.writerow``.

     Args:
         doPrint: Unused here; retained so existing callers keep working.

     Raises:
         ValueError: If a row does not provide exactly three cells after
             the first one.
         Exception: Failures in the stem/metaphone helpers or in the writer
             are no longer trapped with ``pdb.set_trace()``; they propagate
             so that no row is written from undefined or leftover values.
     """
     self.converter.writer.writeheader()
     for tr in self.getParsedHTML(convertEntities=False).findAll("tr"):
         cells = tr.findAll("td")
         if not cells:
             continue
         # skip the first cell, which is page numbers from Pokorny's PIE
         # dictionary
         terms, seeAlsos, definition = cells[1:]
         pieTerms = []
         for term in terms.findAll("span"):
             pieTerms.append(term.text)
             pieTerms.extend(utils.getWordPermutations(term.text))
         # we're going to add these to the keywords too
         pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
         # clean up definitions text
         self.stripTags(definition)
         if not pieTerms:
             # nothing to emit for this row
             continue
         # Row-level values: identical for every term of this row, so they
         # are computed once here instead of inside the term loop.
         # NOTE(review): relies on the helpers being repeatable for equal
         # input -- confirm.
         eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
         engStems = self.getStems(eng)
         engPhones = utils.getMetaphones(engStems)
         seeAlsoStems = []
         for ref in pieSeeAlsos:
             seeAlsoStems.extend(self.getStems(ref, withUnicode=True))
         # put it all together
         for pie in pieTerms:
             pieStems = self.getStems(pie, withUnicode=True)
             pieStems.extend(seeAlsoStems)
             piePhones = utils.getMetaphones(
                 set(pieStems + pieTerms + pieSeeAlsos))
             self.converter.writer.writerow({
                 self.converter.fields[0]: pie,
                 self.converter.fields[1]: eng,
                 self.converter.fields[2]: json.dumps(pieSeeAlsos),
                 self.converter.fields[3]: json.dumps(pieStems),
                 self.converter.fields[4]: json.dumps(engStems),
                 self.converter.fields[5]: json.dumps(piePhones),
                 self.converter.fields[6]: json.dumps(engPhones),
             })
Beispiel #3
0
 def splitPermutations(self, field1, field2):
     """Return the formatted rows for every permutation of ``field1``.

     ``field1`` is expanded with ``utils.getWordPermutations``; each
     resulting word is formatted together with ``field2`` by
     ``self.converter.formatRow`` and the pieces are concatenated in order.
     An empty string is returned when there are no permutations.
     """
     return "".join(
         self.converter.formatRow(variant, field2)
         for variant in utils.getWordPermutations(field1)
     )
Beispiel #4
0
 def test_getPermutationsInitialUnicode(self):
     # An optional initial "(s)" should expand to the form without the "s"
     # followed by the form with it; the expected strings spell the
     # non-ASCII vowel as escaped bytes.
     permutations = utils.getWordPermutations("*(s)tanā-")
     self.assertEqual(
         permutations,
         ['*tan\xc4\x81-', '*stan\xc4\x81-'],
     )
Beispiel #5
0
 def test_getPermutationsInitial(self):
     # An optional initial "(s)" should expand to the form without the "s"
     # followed by the form with it.
     permutations = utils.getWordPermutations("*(s)tano")
     self.assertEqual(permutations, ['*tano', '*stano'])
Beispiel #6
0
 def splitPermutations(self, field1, field2):
     """Format one row per permutation of ``field1`` and concatenate them.

     The permutations come from ``utils.getWordPermutations(field1)``; each
     one is paired with ``field2`` and passed to
     ``self.converter.formatRow``.  The formatted pieces are joined in
     permutation order, giving ``""`` when no permutations are returned.
     """
     formatRow = self.converter.formatRow
     pieces = [
         formatRow(candidate, field2)
         for candidate in utils.getWordPermutations(field1)
     ]
     return "".join(pieces)