def _get_map_for_language(self, language, otherlist=None):
    if (language, otherlist) not in self._to_mapping:
        if otherlist is not None:
            to = []
            for item in read_dicts(otherlist):
                to.append((item['ID'], item.get('GLOSS', item.get('ENGLISH'))))
        else:
            mapfile = PKG_PATH.joinpath('data', 'map-{0}.tsv'.format(language))
            to = [(cs['ID'], cs['GLOSS']) for cs in read_dicts(mapfile)]
        self._to_mapping[(language, otherlist)] = to
    return self._to_mapping[(language, otherlist)]

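# Usage sketch for the cache above. Everything named here is an assumption,
# not part of the code: the pyconcepticon package provides the `Concepticon`
# API class, and REPOS points at a local clone of concepticon-data.
from pyconcepticon import Concepticon

REPOS = 'path/to/concepticon-data'  # hypothetical path
api = Concepticon(REPOS)
pairs = api._get_map_for_language('en')  # list of (conceptset ID, gloss) pairs
# Repeated calls with the same arguments are served from self._to_mapping:
assert pairs is api._get_map_for_language('en')
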
def conceptsets(self):
    """
    :returns: `dict` mapping ConceptSet IDs to `Conceptset` instances.
    """
    return to_dict(
        Conceptset(api=self, **lowercase(d))
        for d in read_dicts(self.data_path('concepticon.tsv')))

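# Minimal lookup sketch, reusing `api` from above and assuming `conceptsets`
# is exposed as a (lazy) property as in pyconcepticon; the ID '1' is purely
# illustrative.
cs = api.conceptsets['1']
print(cs.gloss)  # attribute name per the Conceptset class assumed above
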
def map(self, clist, otherlist=None, out=None, full_search=False,
        similarity_level=5, language='en', skip_multiple=False):
    assert clist.exists(), "File %s does not exist" % clist
    from_ = read_dicts(clist)

    to = self._get_map_for_language(language, otherlist)
    cmap = (concept_map if full_search else concept_map2)(
        [i.get('GLOSS', i.get('ENGLISH')) for i in from_],
        [i[1] for i in to],
        similarity_level=similarity_level,
        freqs=self.frequencies,
        language=language)
    good_matches = 0
    with UnicodeWriter(out) as writer:
        writer.writerow(
            list(from_[0].keys())
            + ['CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for i, item in enumerate(from_):
            row = list(item.values())
            matches, sim = cmap.get(i, ([], 10))
            if sim <= similarity_level:
                good_matches += 1
            if not matches:
                writer.writerow(row + ['', '???', ''])
            elif len(matches) == 1:
                row.extend(
                    [to[matches[0]][0], to[matches[0]][1].split('///')[0], sim])
                writer.writerow(row)
            else:
                assert not full_search
                # We need a list (not a set) to retain the order by frequency.
                visited = []
                for j in matches:
                    cid, gls = to[j][0], to[j][1].split('///')[0]
                    if (cid, gls) not in visited:
                        visited.append((cid, gls))
                if len(visited) > 1:
                    if not skip_multiple:
                        writer.writeblock(
                            row + [cid, gls, sim] for cid, gls in visited)
                else:
                    row.extend([visited[0][0], visited[0][1], sim])
                    writer.writerow(row)
        writer.writerow(
            ['#',
             '{0}/{1}'.format(good_matches, len(from_)),
             '{0:.0f}%'.format(100 * good_matches / len(from_))]
            + (len(from_[0]) - 1) * [''])
    if out is None:
        print(writer.read().decode('utf-8'))

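# Mapping sketch, reusing `api` from above. `mylist.tsv` is a hypothetical
# concept list with a GLOSS (or ENGLISH) column; with out=None the result is
# printed to stdout instead of written to a file.
from pathlib import Path

api.map(Path('mylist.tsv'), out='mylist-mapped.tsv', similarity_level=5)
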
def conceptlists(self):
    """
    :returns: `dict` mapping ConceptList IDs to `Conceptlist` instances.

    .. note:: Individual concepts can be accessed via `Conceptlist.concepts`.
    """
    return to_dict(
        Conceptlist(api=self, **lowercase(d))
        for d in read_dicts(self.data_path('conceptlists.tsv')))

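# Enumeration sketch, reusing `api` from above and assuming `conceptlists` is
# a (lazy) property as in pyconcepticon.
for clid, clist in sorted(api.conceptlists.items()):
    print(clid, len(clist.concepts))
    break  # just the first list, to keep the output short
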
def _metadata(self, id_):
    values_path = self.data_path('concept_set_meta', id_ + '.tsv')
    md_path = self.data_path('concept_set_meta', id_ + '.tsv' + MD_SUFFIX)
    assert values_path.exists() and md_path.exists()
    md = jsonlib.load(md_path)
    return Metadata(
        id=id_,
        meta=md,
        values=to_dict(
            read_dicts(values_path, schema=md['tableSchema']),
            key=operator.itemgetter('CONCEPTICON_ID')))

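# Metadata sketch, reusing `api` from above. 'wold' is an assumed example id;
# valid ids are the stems of the *.tsv files under concept_set_meta/.
md = api._metadata('wold')
# md.values maps CONCEPTICON_ID to the corresponding metadata row.
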
@classmethod
def from_file(cls, path, **keywords):
    """
    Load a concept list from a file outside the Concepticon collection.
    """
    path = Path(path)
    assert path.exists()
    attrs = {f: keywords.get(f, '') for f in Conceptlist.public_fields()}
    attrs.update(
        id=path.stem,
        items=keywords.get('items', len(read_dicts(path))),
        year=keywords.get('year', 0))
    return cls(api=path, **attrs)

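# Sketch: load a list that is not part of the collection. 'private-list.tsv'
# and the keyword values are hypothetical; omitted metadata fields fall back
# to the defaults set above.
cl = Conceptlist.from_file('private-list.tsv', author='Someone', year=2020)
print(cl.id, cl.items, cl.year)
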
def concepts(self):
    res = []
    if self.path.exists():
        for item in read_dicts(self.path):
            kw, attributes = {}, {}
            for k, v in item.items():
                if k:
                    kl = k.lower()
                    setitem(
                        kw if kl in Concept.public_fields() else attributes,
                        kl,
                        v)
            res.append(Concept(list=self, attributes=attributes, **kw))
    return to_dict(res)

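# Sketch, reusing `cl` from above: columns whose lowercased names are in
# Concept.public_fields() become constructor keywords; everything else lands
# in the `attributes` dict.
for concept in cl.concepts.values():
    print(concept.id, concept.attributes)
    break
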
def __init__(self, path):
    rels = defaultdict(dict)
    self.raw = list(read_dicts(path))
    for item in self.raw:
        rels[item['SOURCE']][item['TARGET']] = item['RELATION']
        rels[item['SOURCE_GLOSS']][item['TARGET_GLOSS']] = item['RELATION']
        if item['RELATION'] in _INVERSE_RELATIONS:
            rels[item['TARGET']][item['SOURCE']] = \
                _INVERSE_RELATIONS[item['RELATION']]
            rels[item['TARGET_GLOSS']][item['SOURCE_GLOSS']] = \
                _INVERSE_RELATIONS[item['RELATION']]
    dict.__init__(self, rels.items())

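# Relation-lookup sketch. The enclosing class is not shown; `ConceptRelations`
# is used here as a hypothetical name for it, and `path` would point at a TSV
# with SOURCE, TARGET, *_GLOSS and RELATION columns such as Concepticon's
# conceptrelations.tsv.
#
#     rels = ConceptRelations(path)
#     rels['1234']['5678']   # e.g. 'broader', with the inverse 'narrower'
#                            # filled in automatically via _INVERSE_RELATIONS
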
def __init__(self, clid):
    self.clid = clid
    self.concepts = {
        'CONCEPTICON_ID': {},  # maps ID to GLOSS
        'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
    }
    for cs in read_dicts(data_path('concepticon.tsv')):
        self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
        self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']
    self._cid_index = None
    self._cgloss_index = None
    self._link_col = (None, None)
    self._number_index = None

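# Lookup sketch. The enclosing class is not shown; `GlossMapper` is a
# hypothetical name for it.
#
#     mapper = GlossMapper('Swadesh-1955-100')  # clid of the target list
#     cid = mapper.concepts['CONCEPTICON_GLOSS']['TREE']       # gloss -> ID
#     assert mapper.concepts['CONCEPTICON_ID'][cid] == 'TREE'  # ID -> gloss
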
@classmethod
def from_file(cls, path, **keywords):
    """
    Load a concept list from a file outside the Concepticon collection.

    @todo: add a uniqueness check here, cf. the function read_dicts.
    """
    path = Path(path)
    assert path.exists()
    attrs = {f: keywords.get(f, '') for f in Conceptlist.public_fields()}
    attrs.update(
        id=path.stem,
        items=keywords.get('items', len(read_dicts(path))),
        year=keywords.get('year', 0),
        local=True)
    return cls(api=path, **attrs)

def read_tsv(path, unique='ID'):
    uniquevalues = set()
    rows = []
    for line, row in enumerate(read_dicts(path)):
        line += 2  # 1-based file lines, with the header occupying line 1
        if None in row:
            error('too many columns', path, line)  # pragma: no cover
        if unique:
            if unique not in row:  # pragma: no cover
                error('unique key missing: %s' % unique, path, line)
                continue
            if row[unique] in uniquevalues:  # pragma: no cover
                error('non-unique %s: %s' % (unique, row[unique]), path, line)
            uniquevalues.add(row[unique])
        rows.append((line, row))
    return rows

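# Sketch for read_tsv: rows are returned with their original file line numbers
# (header = line 1, first data row = line 2) so that downstream checks can
# report positions. 'list.tsv' is a hypothetical input with an ID column.
for line_no, row in read_tsv('list.tsv')[:3]:
    print(line_no, row['ID'])
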
def run(args):
    for cl in get_conceptlist(args, path_only=True):
        print(termcolor.colored(cl, attrs=['bold', 'underline']))
        items = list(enumerate(read_dicts(cl), start=2))
        for check in CHECKS:
            print(termcolor.colored(
                'Check: {0}'.format(check.__name__), attrs=['bold']))
            if args.verbose and check.__doc__:
                print(check.__doc__)
            try:
                check(items, args)
            except Exception as e:
                print(termcolor.colored(
                    '{0}: {1}'.format(e.__class__.__name__, e), color='red'))
        print()

def __init__(self, path, multiple=False):
    rels = defaultdict(lambda: defaultdict(set))
    self.raw = list(read_dicts(path))
    for item in self.raw:
        if multiple:
            rels[item['SOURCE']][item['TARGET']].add(item['RELATION'])
            rels[item['SOURCE_GLOSS']][item['TARGET_GLOSS']].add(item['RELATION'])
            if item['RELATION'] in _INVERSE_RELATIONS:
                rels[item['TARGET']][item['SOURCE']].add(
                    _INVERSE_RELATIONS[item['RELATION']])
                rels[item['TARGET_GLOSS']][item['SOURCE_GLOSS']].add(
                    _INVERSE_RELATIONS[item['RELATION']])
        else:
            rels[item['SOURCE']][item['TARGET']] = item['RELATION']
            rels[item['SOURCE_GLOSS']][item['TARGET_GLOSS']] = item['RELATION']
            if item['RELATION'] in _INVERSE_RELATIONS:
                rels[item['TARGET']][item['SOURCE']] = \
                    _INVERSE_RELATIONS[item['RELATION']]
                rels[item['TARGET_GLOSS']][item['SOURCE_GLOSS']] = \
                    _INVERSE_RELATIONS[item['RELATION']]
    dict.__init__(self, rels.items())

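# Sketch contrasting the two modes above, with the same hypothetical
# `ConceptRelations` name as before: with multiple=True each cell holds a set,
# so several relations between one pair of concept sets are all preserved.
#
#     rels = ConceptRelations(path, multiple=True)
#     rels['1234']['5678']   # a set like {'broader'} rather than a bare string
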
def conceptsets(self):
    return read_dicts(self.data_path('concepticon.tsv'))

def conceptlist(self, id_):
    return read_dicts(self.data_path('conceptlists', id_ + '.tsv'))

def map(self, clist, otherlist=None, out=None, full_search=False,
        similarity_level=5, language='en'):
    assert clist.exists(), "File %s does not exist" % clist
    from_ = []
    for item in read_dicts(clist):
        from_.append((
            item.get('ID', item.get('NUMBER')),
            item.get('GLOSS', item.get('ENGLISH'))))

    to = self._get_map_for_language(language, otherlist)

    if not full_search:
        cmap = concept_map2(
            [i[1] for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level,
            freqs=self.frequencies,
            language=language)
        good_matches = 0
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(
                ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                matches, sim = cmap.get(i, ([], 10))
                if sim <= similarity_level:
                    good_matches += 1
                if not matches:
                    writer.writerow(row + ['', '???', ''])
                elif len(matches) == 1:
                    row.extend(
                        [to[matches[0]][0], to[matches[0]][1].split('///')[0], sim])
                    writer.writerow(row)
                else:
                    # We need a list (not a set) to retain the order by frequency.
                    visited = []
                    for j in matches:
                        cid, gls = to[j][0], to[j][1].split('///')[0]
                        if (cid, gls) not in visited:
                            visited.append((cid, gls))
                    if len(visited) > 1:
                        writer.writerow(['<<<', '', '', ''])
                        for cid, gls in visited:
                            writer.writerow(row + [cid, gls, sim])
                        writer.writerow(['>>>', '', '', ''])
                    else:
                        row.extend([visited[0][0], visited[0][1], sim])
                        writer.writerow(row)
            writer.writerow([
                '#',
                good_matches,
                len(from_),
                '{0:.2f}'.format(good_matches / len(from_))])
    else:
        cmap = concept_map(
            [i[1] for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level)
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                match = cmap.get(i)
                row.extend(list(to[match[0]]) if match else ['', ''])
                writer.writerow(row)
    if out is None:
        print(writer.read().decode('utf-8'))

def check_new(args):
    """
    Perform a number of sanity checks for a new concept list.

    Notes
    -----
    Expects a well-formed concept list as input (i.e. tsv, 'ID',
    'CONCEPTICON_ID', 'NUMBER', 'CONCEPTICON_GLOSS' columns, etc.) and tests
    for a number of potential issues:

    - mismatch between glosses and Concepticon IDs
    - availability of glosses in Concepticon
    - if proposed glosses (starting with !) don't have IDs (they shouldn't!)
    - if glosses are mapped more than once
    - if 'NUMBER' and 'ID' are unique for the respective concept list.

    Examples
    --------
    $ concepticon checknew path_to_conceptlist.tsv
    """
    list_to_check = read_dicts(args.args[0])
    api = Concepticon(args.repos)
    con_glosses = {c.id: c.gloss for c in api.conceptsets.values()}

    def _get_duplicates(to_check):
        # set.add returns None, so the condition only holds for repeated keys.
        known_items = set()
        return [
            (i, key) for i, key in enumerate(to_check)
            if key in known_items or known_items.add(key)]

    for index, entry in enumerate(list_to_check):
        gloss = entry.get('CONCEPTICON_GLOSS', '')
        cid = entry.get('CONCEPTICON_ID', '')

        # Test if gloss matches Concepticon ID:
        if con_glosses.get(cid) != gloss:
            print("Gloss " + gloss + " in line " + str(index + 1) +
                  " doesn't match ID " + cid + ".")

        # Test if gloss exists in Concepticon:
        if gloss not in con_glosses.values():
            print("Gloss " + gloss + " in line " + str(index + 1) +
                  " doesn't exist in Concepticon.")

        # Test if proposed glosses (!GLOSS) have NULL ID:
        if gloss.startswith('!') and cid:
            print("Proposed gloss " + gloss + " in line " + str(index + 1) +
                  " shouldn't have a CONCEPTICON_ID.")

    print("\nChecking for uniqueness of glosses:")
    try:
        for i, key in _get_duplicates(
                [d['CONCEPTICON_GLOSS'] for d in list_to_check]):
            print("Gloss " + key + " doubled in line " + str(i + 2) + ".")
    except KeyError:
        pass

    print("\nChecking for uniqueness of 'NUMBER' and 'ID':")
    try:
        for i, key in _get_duplicates([d['ID'] for d in list_to_check]):
            print("ID " + key + " doubled in line " + str(i + 2) + ".")
        for i, key in _get_duplicates([d['NUMBER'] for d in list_to_check]):
            print("NUMBER " + key + " doubled in line " + str(i + 2) + ".")
    except KeyError:
        pass

def conceptlists_dicts(self):
    return read_dicts(self.data_path('conceptlists.tsv'))