def run(self, doPrint=False):
    """Write a header and then one output row per ``dt`` tag in the HTML.

    For every ``dt`` tag returned by ``self.getParsedHTML()``, the next
    sibling tag is treated as its definition.  Both texts are cleaned,
    stemmed and run through ``utils.getMetaphones``; the results are
    written through ``self.converter.writer`` keyed by
    ``self.converter.fields[0]`` .. ``[6]``.

    Args:
        doPrint: Accepted for interface compatibility; not used here.

    Raises:
        Exception: Anything raised while computing metaphones or writing
            a row is propagated unchanged.
    """
    writer = self.converter.writer
    fields = self.converter.fields
    writer.writeheader()
    for dt in self.getParsedHTML().findAll("dt"):
        dd = dt.findNextSibling()
        self.stripTags(dd)
        # "gla" is presumably the Scottish Gaelic headword (ISO 639 code);
        # verify against the converter's field names.
        gla = stripAllHTML(self.fixUp(dt.getText(" ")))
        glaStems = self.getStems(gla, withUnicode=True)
        eng = stripAllHTML(unicode(self.fixUp(str(dd)).decode("utf-8")))
        engStems = self.getStems(eng)
        # Errors are deliberately not caught.  The previous handlers dropped
        # into pdb, and resuming from there either hit a NameError on the
        # first entry or wrote the previous entry's metaphones into this row.
        glaPhones = utils.getMetaphones(glaStems)
        engPhones = utils.getMetaphones(engStems)
        row = {
            fields[0]: gla,
            fields[1]: eng,
            fields[2]: "",  # third column is always written empty here
            fields[3]: json.dumps(glaStems),
            fields[4]: json.dumps(engStems),
            fields[5]: json.dumps(glaPhones),
            fields[6]: json.dumps(engPhones),
        }
        writer.writerow(row)
def run(self, doPrint=False):
    """Convert each ``dt`` term of the parsed HTML into one output row.

    A header is written first.  Then, for every ``dt`` tag, the tag that
    follows it is used as the definition; the term and definition are
    cleaned, stemmed and passed to ``utils.getMetaphones``, and the row
    is written via ``self.converter.writer`` using the column names in
    ``self.converter.fields``.

    Args:
        doPrint: Accepted for interface compatibility; not used here.

    Raises:
        Exception: Errors from metaphone generation or from the writer
            propagate to the caller unchanged.
    """
    writer = self.converter.writer
    fields = self.converter.fields
    writer.writeheader()
    termsTags = self.getParsedHTML().findAll("dt")
    for dt in termsTags:
        dd = dt.findNextSibling()
        self.stripTags(dd)
        gla = stripAllHTML(self.fixUp(dt.getText(" ")))
        glaStems = self.getStems(gla, withUnicode=True)
        eng = stripAllHTML(unicode(self.fixUp(str(dd)).decode("utf-8")))
        engStems = self.getStems(eng)
        # No try/except here on purpose: the old handlers called
        # pdb.set_trace(), and continuing from the debugger reused stale
        # (or unbound) phone values when building the row below.
        glaPhones = utils.getMetaphones(glaStems)
        engPhones = utils.getMetaphones(engStems)
        row = {
            fields[0]: gla,
            fields[1]: eng,
            fields[2]: "",  # third column is always written empty here
            fields[3]: json.dumps(glaStems),
            fields[4]: json.dumps(engStems),
            fields[5]: json.dumps(glaPhones),
            fields[6]: json.dumps(engPhones),
        }
        writer.writerow(row)
def run(self, doPrint=False):
    """Write a header and then one output row per PIE term variant.

    Every ``tr`` of the parsed HTML that contains ``td`` cells is expected
    to hold four cells: a leading cell that is skipped, then terms,
    see-also references and the definition.  Each ``span`` in the terms
    cell contributes its text plus ``utils.getWordPermutations`` of that
    text, and one row is written for every resulting term.

    Args:
        doPrint: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: If a row with cells does not have exactly three cells
            after the first one (tuple unpacking fails).
        Exception: Errors from metaphone generation or from the writer
            propagate to the caller unchanged.
    """
    writer = self.converter.writer
    fields = self.converter.fields
    writer.writeheader()
    rowTags = self.getParsedHTML(convertEntities=False).findAll("tr")
    for tr in rowTags:
        cells = tr.findAll("td")
        if not cells:
            continue
        # skip the first cell, which is page numbers from Pokorny's PIE
        # dictionary
        terms, seeAlsos, definition = cells[1:]
        pieTerms = []
        for term in terms.findAll("span"):
            pieTerms.append(term.text)
            pieTerms.extend(utils.getWordPermutations(term.text))
        # we're going to add these to the keywords too
        pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
        # clean up definitions text
        self.stripTags(definition)
        # put it all together
        for pie in pieTerms:
            pieStems = self.getStems(pie, withUnicode=True)
            # Plain loop: the extend() calls are wanted for their side
            # effect, not for a list of None results.
            for seeAlso in pieSeeAlsos:
                pieStems.extend(self.getStems(seeAlso, withUnicode=True))
            eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
            engStems = self.getStems(eng)
            # Errors are deliberately not caught.  The previous handlers
            # dropped into pdb, and resuming from there wrote stale (or
            # unbound) phone values into the row.
            piePhones = utils.getMetaphones(
                set(pieStems + pieTerms + pieSeeAlsos))
            engPhones = utils.getMetaphones(engStems)
            row = {
                fields[0]: pie,
                fields[1]: eng,
                fields[2]: json.dumps(pieSeeAlsos),
                fields[3]: json.dumps(pieStems),
                fields[4]: json.dumps(engStems),
                fields[5]: json.dumps(piePhones),
                fields[6]: json.dumps(engPhones),
            }
            writer.writerow(row)
def run(self, doPrint=False):
    """Convert the parsed HTML table into one output row per PIE term.

    After writing the header, each ``tr`` that has ``td`` cells is split
    into (ignored first cell, terms, see-alsos, definition).  The term
    list is built from every ``span`` text in the terms cell together
    with its ``utils.getWordPermutations``; a row is written for each
    entry of that list via ``self.converter.writer``.

    Args:
        doPrint: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: If a row with cells does not have exactly three cells
            after the first one (tuple unpacking fails).
        Exception: Errors from metaphone generation or from the writer
            propagate to the caller unchanged.
    """
    writer = self.converter.writer
    fields = self.converter.fields
    writer.writeheader()
    for tr in self.getParsedHTML(convertEntities=False).findAll("tr"):
        cells = tr.findAll("td")
        if not cells:
            continue
        # skip the first cell, which is page numbers from Pokorny's PIE
        # dictionary
        terms, seeAlsos, definition = cells[1:]
        pieTerms = []
        for term in terms.findAll("span"):
            pieTerms.append(term.text)
            pieTerms.extend(utils.getWordPermutations(term.text))
        # we're going to add these to the keywords too
        pieSeeAlsos = [x.text for x in seeAlsos.findAll("span")]
        # clean up definitions text
        self.stripTags(definition)
        # put it all together
        for pie in pieTerms:
            pieStems = self.getStems(pie, withUnicode=True)
            # Explicit loop instead of a comprehension evaluated only for
            # its extend() side effects.
            for seeAlso in pieSeeAlsos:
                pieStems.extend(self.getStems(seeAlso, withUnicode=True))
            eng = unicode(self.fixUp(str(definition)).decode("utf-8"))
            engStems = self.getStems(eng)
            # No try/except on purpose: the old handlers called
            # pdb.set_trace(), and continuing from the debugger reused
            # stale (or unbound) phone values in the row below.
            piePhones = utils.getMetaphones(
                set(pieStems + pieTerms + pieSeeAlsos))
            engPhones = utils.getMetaphones(engStems)
            row = {
                fields[0]: pie,
                fields[1]: eng,
                fields[2]: json.dumps(pieSeeAlsos),
                fields[3]: json.dumps(pieStems),
                fields[4]: json.dumps(engStems),
                fields[5]: json.dumps(piePhones),
                fields[6]: json.dumps(engPhones),
            }
            writer.writerow(row)
def run(self):
    """Copy the input rows to the output file with keyword columns added.

    Runs the parent ``run()`` first, then reads ``self.inFilename`` with
    ``unicsv.UnicodeReader`` and writes ``self.outFilename`` with
    ``unicsv.UnicodeWriter`` using the ``ProtoCelticDictionaryV1`` fields.
    Each row gets ``see-also`` (empty), ``pcl-keywords``,
    ``eng-keywords``, ``pcl-metaphone`` and ``eng-metaphone`` filled in
    from its ``pcl`` and ``eng`` columns.

    Raises:
        Exception: Errors from metaphone generation propagate unchanged,
            so an incomplete row is never written.
    """
    super(AddProtoCelticKeywords, self).run()
    reader = unicsv.UnicodeReader(self.inFilename)
    fieldnames = collection.ProtoCelticDictionaryV1().fields
    writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
    writer.writeheader()
    for row in reader:
        pclOrig = row["pcl"].split()
        engOrig = row["eng"].split()
        pcl = utils.getUnicodeStems(pclOrig)
        eng = utils.getStems(engOrig)
        row["see-also"] = ""
        # NOTE(review): the keyword columns stem the already-stemmed
        # lists a second time; kept as-is, confirm this is intended.
        row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
        row["eng-keywords"] = json.dumps(utils.getStems(eng))
        # Metaphones are computed over the stems plus the original words.
        pcl = pcl + pclOrig
        eng = eng + engOrig
        # Errors are deliberately not caught.  The previous handler dropped
        # into pdb, and resuming from there wrote the row with missing or
        # partial metaphone columns.
        row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
        row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
        writer.writerow(row)
def run(self):
    """Add keyword and metaphone columns to every row of the input file.

    Calls the parent ``run()``, then streams rows from
    ``unicsv.UnicodeReader(self.inFilename)`` into
    ``unicsv.UnicodeWriter(self.outFilename, ...)`` whose columns come
    from ``collection.ProtoCelticDictionaryV1().fields``.  The ``pcl``
    and ``eng`` columns are split on whitespace and used to populate
    ``pcl-keywords``, ``eng-keywords``, ``pcl-metaphone`` and
    ``eng-metaphone``; ``see-also`` is set to an empty string.

    Raises:
        Exception: Errors from metaphone generation propagate unchanged,
            so an incomplete row is never written.
    """
    super(AddProtoCelticKeywords, self).run()
    reader = unicsv.UnicodeReader(self.inFilename)
    fieldnames = collection.ProtoCelticDictionaryV1().fields
    writer = unicsv.UnicodeWriter(self.outFilename, fieldnames)
    writer.writeheader()
    for row in reader:
        pclOrig = row["pcl"].split()
        engOrig = row["eng"].split()
        pcl = utils.getUnicodeStems(pclOrig)
        eng = utils.getStems(engOrig)
        row["see-also"] = ""
        # NOTE(review): stems are stemmed again for the keyword columns;
        # behaviour preserved, confirm this is intended.
        row["pcl-keywords"] = json.dumps(utils.getUnicodeStems(pcl))
        row["eng-keywords"] = json.dumps(utils.getStems(eng))
        # Metaphones cover both the stems and the original words.
        pcl = pcl + pclOrig
        eng = eng + engOrig
        # No try/except on purpose: the old handler called
        # pdb.set_trace(), and continuing from the debugger wrote a row
        # whose metaphone columns were missing or only half filled.
        row["pcl-metaphone"] = json.dumps(utils.getMetaphones(pcl))
        row["eng-metaphone"] = json.dumps(utils.getMetaphones(eng))
        writer.writerow(row)
def test_getMetaphonesMultipleWords(self):
    """getMetaphones accepts a list of words and returns their codes."""
    words = ["This", "is", "Baxter's", "favorite", "seat"]
    results = utils.getMetaphones(words)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(results, ['0S', 'AS', 'FFRT', 'PKSTRRS', 'ST', 'TS'])
def test_getMetaphonesSentence(self):
    """getMetaphones accepts a whole sentence passed as one string."""
    text = "This is Baxter's favorite seat"
    results = utils.getMetaphones(text)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(results, ['0S', 'AS', 'FFRT', 'PKSTRRS', 'ST', 'TS'])
def test_getMetaphonesUnicodeWord(self):
    """getMetaphones handles a single word containing non-ASCII letters."""
    text = "φrāko"
    results = utils.getMetaphones(text)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(results, ['0RK', 'TRK'])
def test_getMetaphonesWord(self):
    """getMetaphones returns a one-element list for a single plain word."""
    text = "Baxter"
    results = utils.getMetaphones(text)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(results, ['PKSTR'])