def map(self, clist, otherlist=None, out=None, full_search=False,
        similarity_level=5, language='en', skip_multiple=False):
    """
    Map the concepts in a concept list to Concepticon concept sets.

    :param clist: Path to the concept list file to be mapped; must exist.
    :param otherlist: Optional alternative mapping list passed through to \
    ``_get_map_for_language``.
    :param out: Output target for the mapping; if ``None``, the result is \
    printed to stdout.
    :param full_search: If ``True``, use the exhaustive ``concept_map`` \
    search instead of ``concept_map2``.
    :param similarity_level: Maximum similarity score counted as a "good" \
    match (lower scores are better matches).
    :param language: Language of the glosses in the concept list.
    :param skip_multiple: If ``True``, suppress output rows for concepts \
    with more than one candidate match.
    """
    assert clist.exists(), "File %s does not exist" % clist
    from_ = read_dicts(clist)
    to = self._get_map_for_language(language, otherlist)
    # Choose the matching strategy; both return {row_index: (matches, sim)}.
    cmap = (concept_map if full_search else concept_map2)(
        [i.get('GLOSS', i.get('ENGLISH')) for i in from_],
        [i[1] for i in to],
        similarity_level=similarity_level,
        freqs=self.frequencies,
        language=language)
    good_matches = 0
    with UnicodeWriter(out) as writer:
        writer.writerow(
            list(from_[0].keys())
            + ['CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for i, item in enumerate(from_):
            row = list(item.values())
            # Default (no entry) means "no matches" with a sentinel score 10.
            matches, sim = cmap.get(i, ([], 10))
            if sim <= similarity_level:
                good_matches += 1
            if not matches:
                writer.writerow(row + ['', '???', ''])
            elif len(matches) == 1:
                row.extend([
                    to[matches[0]][0],
                    to[matches[0]][1].split('///')[0],
                    sim])
                writer.writerow(row)
            else:
                assert not full_search
                # we need a list to retain the order by frequency
                visited = []
                for j in matches:
                    gls, cid = to[j][0], to[j][1].split('///')[0]
                    if (gls, cid) not in visited:
                        visited += [(gls, cid)]
                if len(visited) > 1:
                    if not skip_multiple:
                        # BUG FIX: was `writer.writeblock(...)`, which is not
                        # a method of csv-style writers; `writerows` writes
                        # one row per candidate match.
                        writer.writerows(
                            row + [gls, cid, sim] for gls, cid in visited)
                else:
                    row.extend([visited[0][0], visited[0][1], sim])
                    writer.writerow(row)
        # Trailing summary row: fraction and percentage of good matches,
        # padded so the column count matches the header.
        writer.writerow(
            ['#',
             '{0}/{1}'.format(good_matches, len(from_)),
             '{0:.0f}%'.format(100 * good_matches / len(from_))]
            + (len(from_[0]) - 1) * [''])
    if out is None:
        print(writer.read().decode('utf-8'))
def lookup(args):
    """
    Looks up a single gloss from the commandline.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    hits = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    # Flatten the per-gloss match groups into a single list of result rows.
    rows = [match for group in hits for match in group]
    header = ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY']
    with UnicodeWriter(None) as writer:
        writer.writerow(header)
        for row in rows:
            writer.writerow(row)
    print(writer.read().decode('utf-8'))
def _write_linking_data(api, l, args):
    """
    Collect gloss-to-concept-set link frequencies for language ``l`` and
    write them to ``data/map-<iso2>.tsv`` inside the pyconcepticon package.
    """
    out = defaultdict(int)
    freqs = defaultdict(int)
    # Count how often each (concepticon gloss /// local gloss, id) pair
    # occurs across all concept lists.
    for clist in api.conceptlists.values():
        args.log.info("checking {clist.id}".format(clist=clist))
        for row in clist.concepts.values():
            if not row.concepticon_id:
                continue
            gls = None
            if l.iso2 == 'en':
                if row.english:
                    gls = row.english.strip('*$-—+')
            elif l.name in row.attributes and row.attributes[l.name]:
                gls = row.attributes[l.name].strip('*$-—+')
            if gls:
                out[row.concepticon_gloss + '///' + gls, row.concepticon_id] += 1
                freqs[row.concepticon_id] += 1
    if l.iso2 == 'en':
        # For English, add canonical gloss variants per ontological category,
        # each carrying the concept set's observed frequency.
        for cset in api.conceptsets.values():
            gloss = cset.gloss
            low = cset.gloss.lower()
            category = cset.ontological_category
            if category == 'Person/Thing':
                variants = ['the ' + low, 'the ' + low + 's']
            elif category == 'Action/Process':
                variants = ['to ' + low]
            elif category == 'Property':
                variants = [low + ' (adjective)']
            elif category == 'Classifier':
                variants = [low + ' (classifier)']
            else:
                variants = [low]
            for variant in variants:
                out[gloss + '///' + variant, cset.id] = freqs[cset.id]
    p = Path(pyconcepticon.__file__).parent.joinpath(
        'data', 'map-{0}.tsv'.format(l.iso2))
    with UnicodeWriter(p, delimiter='\t') as f:
        f.writerow(['ID', 'GLOSS', 'PRIORITY'])
        for gloss, cid in sorted(out):
            f.writerow([cid, gloss, out[gloss, cid]])
def lookup(args):
    """
    Look up the specified glosses in Concepticon.

    Examples
    --------
    $ concepticon lookup gloss1 gloss2 gloss3 ...
    """
    results = Concepticon().lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None) as writer:
        writer.writerow(
            ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        # Each element of `results` is a group of matches for one gloss;
        # emit every match as its own row.
        for matches in results:
            writer.writerows(matches)
    print(writer.read().decode('utf-8'))