def run(self, doPrint=False):
    """Write one output row per PIE term found in the parsed HTML table.

    The header is written first via ``self.converter.writer``.  Every
    ``<tr>`` that contains ``<td>`` cells is then split into its terms,
    see-also and definition cells; each term (its text plus whatever
    ``utils.getWordPermutations`` returns for it) produces one row keyed by
    ``self.converter.fields[0..6]``: the term, the English definition, and
    the JSON-encoded see-alsos, PIE stems, English stems, PIE metaphones
    and English metaphones.

    ``doPrint`` is accepted for interface compatibility; this method does
    not read it.

    Exceptions raised by the stemming/metaphone helpers or by the writer
    are not caught here: they propagate to the caller instead of letting a
    row be written from missing or stale values.

    NOTE: relies on Python 2 string semantics (``unicode``, ``str.decode``).
    """
    self.converter.writer.writeheader()
    rowTags = self.getParsedHTML(convertEntities=False).findAll("tr")

    for tr in rowTags:
        cells = tr.findAll("td")
        if not cells:
            continue

        # skip the first cell, which is page numbers from Pokorny's PIE
        # dictionary
        terms, seeAlsos, definition = cells[1:]

        pieTerms = []
        for term in terms.findAll("span"):
            pieTerms.append(term.text)
            pieTerms.extend(utils.getWordPermutations(term.text))

        # we're going to add these to the keywords too
        pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]

        # clean up definitions text
        self.stripTags(definition)

        if not pieTerms:
            # No terms means no rows; skip the per-row work below.
            continue

        # Everything in this section depends only on the table row, not on
        # the individual term, so compute it once per row.
        # NOTE(review): assumes getStems/fixUp/getMetaphones are pure
        # functions of their arguments -- confirm.
        seeAlsoStems = []
        for seeAlso in pieSeeAlsos:
            seeAlsoStems.extend(self.getStems(seeAlso, withUnicode=True))
        eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
        engStems = self.getStems(eng)
        engPhones = utils.getMetaphones(engStems)

        # put it all together
        for pie in pieTerms:
            pieStems = self.getStems(pie, withUnicode=True)
            pieStems.extend(seeAlsoStems)
            # The PIE metaphones cover this term's stems plus *all* terms
            # and see-alsos of the row, de-duplicated through a set.
            piePhones = utils.getMetaphones(
                set(pieStems + pieTerms + pieSeeAlsos))

            row = {
                self.converter.fields[0]: pie,
                self.converter.fields[1]: eng,
                self.converter.fields[2]: json.dumps(pieSeeAlsos),
                self.converter.fields[3]: json.dumps(pieStems),
                self.converter.fields[4]: json.dumps(engStems),
                self.converter.fields[5]: json.dumps(piePhones),
                self.converter.fields[6]: json.dumps(engPhones),
            }
            self.converter.writer.writerow(row)
def run(self, doPrint=False):
    """Write one output row per PIE term found in the parsed HTML table.

    The header is written first via ``self.converter.writer``.  Every
    ``<tr>`` that contains ``<td>`` cells is then split into its terms,
    see-also and definition cells; each term (its text plus whatever
    ``utils.getWordPermutations`` returns for it) produces one row keyed by
    ``self.converter.fields[0..6]``: the term, the English definition, and
    the JSON-encoded see-alsos, PIE stems, English stems, PIE metaphones
    and English metaphones.

    ``doPrint`` is accepted for interface compatibility; this method does
    not read it.

    Exceptions raised by the stemming/metaphone helpers or by the writer
    are not caught here: they propagate to the caller instead of letting a
    row be written from missing or stale values.

    NOTE: relies on Python 2 string semantics (``unicode``, ``str.decode``).
    """
    self.converter.writer.writeheader()
    rowTags = self.getParsedHTML(convertEntities=False).findAll("tr")

    for tr in rowTags:
        cells = tr.findAll("td")
        if not cells:
            continue

        # skip the first cell, which is page numbers from Pokorny's PIE
        # dictionary
        terms, seeAlsos, definition = cells[1:]

        pieTerms = []
        for term in terms.findAll("span"):
            pieTerms.append(term.text)
            pieTerms.extend(utils.getWordPermutations(term.text))

        # we're going to add these to the keywords too
        pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]

        # clean up definitions text
        self.stripTags(definition)

        if not pieTerms:
            # No terms means no rows; skip the per-row work below.
            continue

        # Everything in this section depends only on the table row, not on
        # the individual term, so compute it once per row.
        # NOTE(review): assumes getStems/fixUp/getMetaphones are pure
        # functions of their arguments -- confirm.
        seeAlsoStems = []
        for seeAlso in pieSeeAlsos:
            seeAlsoStems.extend(self.getStems(seeAlso, withUnicode=True))
        eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
        engStems = self.getStems(eng)
        engPhones = utils.getMetaphones(engStems)

        # put it all together
        for pie in pieTerms:
            pieStems = self.getStems(pie, withUnicode=True)
            pieStems.extend(seeAlsoStems)
            # The PIE metaphones cover this term's stems plus *all* terms
            # and see-alsos of the row, de-duplicated through a set.
            piePhones = utils.getMetaphones(
                set(pieStems + pieTerms + pieSeeAlsos))

            row = {
                self.converter.fields[0]: pie,
                self.converter.fields[1]: eng,
                self.converter.fields[2]: json.dumps(pieSeeAlsos),
                self.converter.fields[3]: json.dumps(pieStems),
                self.converter.fields[4]: json.dumps(engStems),
                self.converter.fields[5]: json.dumps(piePhones),
                self.converter.fields[6]: json.dumps(engPhones),
            }
            self.converter.writer.writerow(row)
def splitPermutations(self, field1, field2):
    """Return the formatted rows for every permutation of ``field1``.

    Each value returned by ``utils.getWordPermutations(field1)`` is paired
    with ``field2`` and passed to ``self.converter.formatRow``; the
    resulting pieces are concatenated in permutation order.  An empty
    permutation list yields an empty string.
    """
    return "".join(
        self.converter.formatRow(variant, field2)
        for variant in utils.getWordPermutations(field1)
    )
def test_getPermutationsInitialUnicode(self):
    # Optional initial "(s)" on a word containing a non-ASCII character.
    # The a-macron is spelled as its UTF-8 byte escapes (\xc4\x81) so the
    # input uses exactly the same encoding as the expected values below and
    # does not depend on the source file's encoding.
    word = "*(s)tan\xc4\x81-"
    result = utils.getWordPermutations(word)
    # Expected order: the form without the optional "s", then the form
    # with it.
    expected = ['*tan\xc4\x81-', '*stan\xc4\x81-']
    self.assertEqual(result, expected)
def test_getPermutationsInitial(self):
    # A parenthesised optional initial "(s)" is expected to expand to the
    # form without the "s" followed by the form with it.
    permutations = utils.getWordPermutations("*(s)tano")
    self.assertEqual(permutations, ['*tano', '*stano'])