Esempio n. 1
0
 def _get_map_for_language(self, language, otherlist=None):
     """Return (ID, GLOSS) pairs to map against, caching per (language, otherlist).

     If *otherlist* is given, pairs are read from that file; otherwise the
     packaged ``map-<language>.tsv`` data file is used.
     """
     key = (language, otherlist)
     if key not in self._to_mapping:
         if otherlist is None:
             mapfile = PKG_PATH.joinpath('data', 'map-{0}.tsv'.format(language))
             pairs = [(cs['ID'], cs['GLOSS']) for cs in read_dicts(mapfile)]
         else:
             pairs = [
                 (item['ID'], item.get('GLOSS', item.get('ENGLISH')))
                 for item in read_dicts(otherlist)]
         self._to_mapping[key] = pairs
     return self._to_mapping[key]
Esempio n. 2
0
 def conceptsets(self):
     """
     :returns: `dict` mapping ConceptSet IDs to `Conceptset` instances.
     """
     rows = read_dicts(self.data_path('concepticon.tsv'))
     return to_dict(Conceptset(api=self, **lowercase(row)) for row in rows)
Esempio n. 3
0
    def map(self,
            clist,
            otherlist=None,
            out=None,
            full_search=False,
            similarity_level=5,
            language='en',
            skip_multiple=False):
        """
        Map the rows of the concept list *clist* to Concepticon concept sets.

        Appends CONCEPTICON_ID, CONCEPTICON_GLOSS and SIMILARITY columns to
        each input row and writes the result via `UnicodeWriter`.

        :param clist: Path of the concept list to be mapped (must exist).
        :param otherlist: Optional alternative target list, passed on to \
        `_get_map_for_language`.
        :param out: Output file; if `None`, the report is printed to stdout.
        :param full_search: If `True`, use `concept_map`, else `concept_map2`.
        :param similarity_level: Highest similarity score still counted as \
        a good match; also forwarded to the matcher.
        :param language: Language of the glosses, selects the mapping data.
        :param skip_multiple: If `True`, rows with several candidate \
        mappings are omitted from the output instead of written as a block.
        """
        assert clist.exists(), "File %s does not exist" % clist
        from_ = read_dicts(clist)

        to = self._get_map_for_language(language, otherlist)
        # Choose the matcher implementation based on full_search.
        cmap = (concept_map if full_search else concept_map2)(
            [i.get('GLOSS', i.get('ENGLISH')) for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level,
            freqs=self.frequencies,
            language=language
        )
        good_matches = 0
        with UnicodeWriter(out) as writer:
            # NOTE(review): from_[0] raises IndexError on an empty input
            # list — confirm callers guarantee at least one row.
            writer.writerow(
                list(from_[0].keys()) +
                ['CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
            for i, item in enumerate(from_):
                row = list(item.values())
                # ([], 10) is the "no match" default; with the default
                # similarity_level of 5 a 10 never counts as a good match.
                matches, sim = cmap.get(i, ([], 10))
                if sim <= similarity_level:
                    good_matches += 1
                if not matches:
                    writer.writerow(row + ['', '???', ''])
                elif len(matches) == 1:
                    # `to` holds (ID, GLOSS) pairs; only the part of the
                    # gloss before any '///' is reported.
                    row.extend([
                        to[matches[0]][0], to[matches[0]][1].split('///')[0], sim])
                    writer.writerow(row)
                else:
                    # Multiple candidates only occur without full_search.
                    assert not full_search
                    # we need a list to retain the order by frequency
                    visited = []
                    for j in matches:
                        # NOTE(review): names look swapped — to[j][0] is the
                        # ID and to[j][1] the gloss, so `gls` carries the ID;
                        # the emitted column order (ID, GLOSS) is still right.
                        gls, cid = to[j][0], to[j][1].split('///')[0]
                        if (gls, cid) not in visited:
                            visited += [(gls, cid)]
                    if len(visited) > 1:
                        if not skip_multiple:
                            # NOTE(review): writeblock presumably emits one
                            # row per candidate — confirm UnicodeWriter
                            # actually provides this method.
                            writer.writeblock(
                                row + [gls, cid, sim] for gls, cid in visited)
                    else:
                        row.extend([visited[0][0], visited[0][1], sim])
                        writer.writerow(row)
            # Summary row: matched/total and percentage, padded to the
            # column count of the input rows.
            writer.writerow(
                ['#',
                 '{0}/{1}'.format(good_matches, len(from_)),
                 '{0:.0f}%'.format(100 * good_matches / len(from_))] +
                (len(from_[0]) - 1) * [''])

        if out is None:
            # Presumably the writer buffered in memory when out is None;
            # dump the report to stdout.
            print(writer.read().decode('utf-8'))
Esempio n. 4
0
    def conceptlists(self):
        """
        :returns: `dict` mapping ConceptList IDs to `Conceptlist` instances.

        .. note:: Individual concepts can be accessed via `Conceptlist.concepts`.
        """
        rows = read_dicts(self.data_path('conceptlists.tsv'))
        return to_dict(Conceptlist(api=self, **lowercase(row)) for row in rows)
Esempio n. 5
0
 def _metadata(self, id_):
     """Load the metadata description and values for concept set meta item *id_*."""
     values_path = self.data_path('concept_set_meta', id_ + '.tsv')
     md_path = self.data_path('concept_set_meta', id_ + '.tsv' + MD_SUFFIX)
     assert values_path.exists() and md_path.exists()
     md = jsonlib.load(md_path)
     # Values are keyed by their CONCEPTICON_ID column.
     values = to_dict(
         read_dicts(values_path, schema=md['tableSchema']),
         key=operator.itemgetter('CONCEPTICON_ID'))
     return Metadata(id=id_, meta=md, values=values)
Esempio n. 6
0
 def from_file(cls, path, **keywords):
     """
     Load a concept list that lives outside the Concepticon collection.
     """
     path = Path(path)
     assert path.exists()
     attrs = {field: keywords.get(field, '')
              for field in Conceptlist.public_fields()}
     attrs['id'] = path.stem
     attrs['items'] = keywords.get('items', len(read_dicts(path)))
     attrs['year'] = keywords.get('year', 0)
     return cls(api=path, **attrs)
Esempio n. 7
0
 def concepts(self):
     """Return a `dict` of `Concept` instances read from this list's file."""
     collected = []
     if self.path.exists():
         for row in read_dicts(self.path):
             kw = {}
             attributes = {}
             for key, value in row.items():
                 if not key:
                     continue
                 lkey = key.lower()
                 # Known public fields become constructor kwargs, the rest
                 # goes into the attributes dict.
                 target = kw if lkey in Concept.public_fields() else attributes
                 setitem(target, lkey, value)
             collected.append(Concept(list=self, attributes=attributes, **kw))
     return to_dict(collected)
Esempio n. 8
0
 def __init__(self, path):
     """Read relation rows from *path*, indexing them by ID and by gloss."""
     self.raw = list(read_dicts(path))
     rels = defaultdict(dict)
     for item in self.raw:
         relation = item['RELATION']
         rels[item['SOURCE']][item['TARGET']] = relation
         rels[item['SOURCE_GLOSS']][item['TARGET_GLOSS']] = relation
         # Also record the inverse edge when the relation has one.
         if relation in _INVERSE_RELATIONS:
             inverse = _INVERSE_RELATIONS[relation]
             rels[item['TARGET']][item['SOURCE']] = inverse
             rels[item['TARGET_GLOSS']][item['SOURCE_GLOSS']] = inverse
     dict.__init__(self, rels.items())
    def __init__(self, clid):
        """Index Concepticon glosses by ID and IDs by gloss for list *clid*."""
        self.clid = clid
        id2gloss = {}
        gloss2id = {}
        for cs in read_dicts(data_path('concepticon.tsv')):
            id2gloss[cs['ID']] = cs['GLOSS']
            gloss2id[cs['GLOSS']] = cs['ID']
        self.concepts = {
            'CONCEPTICON_ID': id2gloss,  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': gloss2id,  # maps GLOSS to ID
        }

        # Lazily computed lookup state.
        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Esempio n. 10
0
    def from_file(cls, path, **keywords):
        """
        Load a concept list that is not part of the Concepticon collection.

        @todo: add a uniqueness check here, cf. the function read_dicts
        """
        path = Path(path)
        assert path.exists()
        attrs = {field: keywords.get(field, '')
                 for field in Conceptlist.public_fields()}
        attrs.update(
            id=path.stem,
            items=keywords.get('items', len(read_dicts(path))),
            year=keywords.get('year', 0),
            local=True)
        return cls(api=path, **attrs)
Esempio n. 11
0
def read_tsv(path, unique='ID'):
    """Read *path* as TSV, reporting format errors; return (line, row) pairs."""
    seen = set()
    rows = []
    for offset, row in enumerate(read_dicts(path)):
        line = offset + 2  # first data row sits on file line 2
        if None in row:
            error('too many columns', path, line)  # pragma: no cover
        if unique:
            if unique not in row:  # pragma: no cover
                error('unique key missing: %s' % unique, path, line)
                continue
            value = row[unique]
            if value in seen:  # pragma: no cover
                error('non-unique %s: %s' % (unique, value), path, line)
            seen.add(value)
        rows.append((line, row))
    return rows
Esempio n. 12
0
def run(args):
    """Run every registered check on each selected concept list, printing results."""
    for cl in get_conceptlist(args, path_only=True):
        print(termcolor.colored(cl, attrs=['bold', 'underline']))
        # Rows are numbered from 2, i.e. by their file line (header is line 1).
        items = list(enumerate(read_dicts(cl), start=2))
        for check in CHECKS:
            header = termcolor.colored(
                'Check: {0}'.format(check.__name__), attrs=['bold'])
            print(header)
            if args.verbose and check.__doc__:
                print(check.__doc__)
            try:
                check(items, args)
            except Exception as e:
                # A failing check must not abort the remaining checks.
                msg = '{0}: {1}'.format(e.__class__.__name__, e)
                print(termcolor.colored(msg, color='red'))
        print()
Esempio n. 13
0
 def __init__(self, path, multiple=False):
     """Read relations from *path*; with *multiple*, collect sets of relations.

     Without *multiple*, a later row overwrites earlier relations for the
     same (source, target) pair.
     """
     self.raw = list(read_dicts(path))
     rels = defaultdict(lambda: defaultdict(set))
     for item in self.raw:
         source, target = item['SOURCE'], item['TARGET']
         sgloss, tgloss = item['SOURCE_GLOSS'], item['TARGET_GLOSS']
         relation = item['RELATION']
         if multiple:
             rels[source][target].add(relation)
             rels[sgloss][tgloss].add(relation)
             if relation in _INVERSE_RELATIONS:
                 inverse = _INVERSE_RELATIONS[relation]
                 rels[target][source].add(inverse)
                 rels[tgloss][sgloss].add(inverse)
         else:
             rels[source][target] = relation
             rels[sgloss][tgloss] = relation
             if relation in _INVERSE_RELATIONS:
                 inverse = _INVERSE_RELATIONS[relation]
                 rels[target][source] = inverse
                 rels[tgloss][sgloss] = inverse
     dict.__init__(self, rels.items())
Esempio n. 14
0
 def conceptsets(self):
     """Return the parsed rows of concepticon.tsv."""
     path = self.data_path('concepticon.tsv')
     return read_dicts(path)
Esempio n. 15
0
 def conceptlist(self, id_):
     """Return the parsed rows of the concept list with the given *id_*."""
     fname = id_ + '.tsv'
     return read_dicts(self.data_path('conceptlists', fname))
Esempio n. 16
0
    def map(self,
            clist,
            otherlist=None,
            out=None,
            full_search=False,
            similarity_level=5,
            language='en'):
        """
        Map the concepts in *clist* to Concepticon concept sets.

        :param clist: Path of the concept list to be mapped (must exist).
        :param otherlist: Optional alternative target list; by default the \
        packaged mapping data for *language* is used.
        :param out: Output file; if `None`, the report is printed to stdout.
        :param full_search: If `True`, use the exhaustive `concept_map` \
        search instead of `concept_map2`.
        :param similarity_level: Highest similarity score still counted as \
        a good match.
        :param language: Language of the glosses in *clist*.
        """
        assert clist.exists(), "File %s does not exist" % clist
        from_ = []
        for item in read_dicts(clist):
            from_.append((item.get('ID', item.get('NUMBER')),
                          item.get('GLOSS', item.get('ENGLISH'))))

        to = self._get_map_for_language(language, otherlist)

        if not full_search:
            cmap = concept_map2([i[1] for i in from_], [i[1] for i in to],
                                similarity_level=similarity_level,
                                freqs=self.frequencies,
                                language=language)
            good_matches = 0
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow([
                    'ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS',
                    'SIMILARITY'
                ])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    # ([], 10) is the "no match" default.
                    matches, sim = cmap.get(i, ([], 10))
                    # BUG FIX: the threshold was hard-coded as 5, silently
                    # ignoring the similarity_level parameter (the sibling
                    # implementation of map() compares against
                    # similarity_level).
                    if sim <= similarity_level:
                        good_matches += 1
                    if not matches:
                        writer.writerow(row + ['', '???', ''])
                    elif len(matches) == 1:
                        row.extend([
                            to[matches[0]][0],
                            to[matches[0]][1].split('///')[0], sim
                        ])
                        writer.writerow(row)
                    else:
                        # we need a list to retain the order by frequency;
                        # to[j] is an (ID, GLOSS) pair, so bind the names
                        # accordingly (the original swapped gls/cid).
                        visited = []
                        for j in matches:
                            cid, gls = to[j][0], to[j][1].split('///')[0]
                            if (cid, gls) not in visited:
                                visited += [(cid, gls)]
                        if len(visited) > 1:
                            # Several candidates: emit them as a <<< ... >>>
                            # block for manual disambiguation.
                            writer.writerow(['<<<', '', '', ''])
                            for cid, gls in visited:
                                writer.writerow(row + [cid, gls, sim])
                            writer.writerow(['>>>', '', '', ''])
                        else:
                            row.extend([visited[0][0], visited[0][1], sim])
                            writer.writerow(row)
                # Summary: number of good matches, total, and their ratio.
                writer.writerow([
                    '#', good_matches,
                    len(from_), '{0:.2f}'.format(good_matches / len(from_))
                ])
        else:
            cmap = concept_map([i[1] for i in from_], [
                i[1] for i in self._get_map_for_language(language, otherlist)
            ],
                               similarity_level=similarity_level)
            with UnicodeWriter(out, delimiter='\t') as writer:
                writer.writerow(
                    ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
                for i, (fid, fgloss) in enumerate(from_):
                    row = [fid, fgloss]
                    match = cmap.get(i)
                    row.extend(list(to[match[0]]) if match else ['', ''])
                    writer.writerow(row)

        if out is None:
            print(writer.read().decode('utf-8'))
Esempio n. 17
0
def check_new(args):
    """
    Perform a number of sanity checks for a new concept list.

    Notes
    -----
    Expects a well-formed concept list as input (i.e. tsv, 'ID',
    'CONCEPTICON_ID', 'NUMBER', 'CONCEPTICON_GLOSS' columns, etc.) and tests
    for a number of potential issues:
        - mismatch between glosses and Concepticon IDs
        - availability of glosses in Concepticon
        - if proposed glosses (starting with !) don't have IDs (they shouldn't!)
        - if glosses are mapped more than once
        - if 'NUMBER' and 'ID' are unique for the respective concept list.

    Examples
    --------
    $ concepticon checknew path_to_conceptlist.tsv
    """
    list_to_check = read_dicts(args.args[0])
    api = Concepticon(args.repos)
    con_glosses = {c.id: c.gloss for c in api.conceptsets.values()}

    def _get_duplicates(to_check):
        # Keeps the first occurrence and reports every later one with its
        # index (set.add returns None, so the `or` admits unseen keys).
        known_items = set()
        return [(i, key) for i, key in enumerate(to_check)
                if key in known_items or known_items.add(key)]

    for index, entry_to_check in enumerate(list_to_check):
        # BUG FIX: the original `except KeyError` handlers re-read the very
        # column whose absence raised the error, so a list with a missing
        # column crashed inside the handler.  Reading the columns once with
        # a default keeps the reported messages intact for complete rows
        # and degrades gracefully for incomplete ones.
        gloss = entry_to_check.get('CONCEPTICON_GLOSS', '')
        cid = entry_to_check.get('CONCEPTICON_ID', '')

        # Test if gloss matches Concepticon ID (an unknown ID yields None
        # from .get and therefore also triggers the report, matching the
        # original KeyError path):
        if con_glosses.get(cid) != gloss:
            print("Gloss " + gloss + " in line " + str(index + 1) +
                  " doesn't match ID " + cid + ".")

        # Test if gloss exists in Concepticon:
        if gloss not in con_glosses.values():
            print("Gloss " + gloss + " in line " + str(index + 1) +
                  " doesn't exist in Concepticon.")

        # Test if proposed glosses (!GLOSS) have NULL ID:
        if gloss.startswith('!') and cid:
            print("Proposed gloss " + gloss + " in line " + str(index + 1) +
                  " shouldn't have a CONCEPTICON_ID.")

    # (sic) "uniquness" is kept verbatim so the emitted output stays stable.
    print("\nChecking for uniquness of glosses:")
    try:
        glosses = _get_duplicates(
            [d['CONCEPTICON_GLOSS'] for d in list_to_check])
        for double in glosses:
            # NOTE(review): glosses use line offset +3 while ID/NUMBER below
            # use +2 — confirm which line-numbering convention is intended.
            print("Gloss " + double[1] +
                  " doubled in line " + str(double[0] + 3) + ".")
    except KeyError:
        # Column missing entirely: nothing to check.
        pass

    print("\nChecking for uniqueness of 'NUMBER' and 'ID':")
    try:
        concept_ids = _get_duplicates([d['ID'] for d in list_to_check])
        for double in concept_ids:
            print("ID " + double[1] +
                  " doubled in line " + str(double[0] + 2) + ".")

        numbers = _get_duplicates([d['NUMBER'] for d in list_to_check])
        for double in numbers:
            print("NUMBER " + double[1] +
                  " doubled in line " + str(double[0] + 2) + ".")
    except KeyError:
        # Column missing entirely: nothing to check.
        pass
Esempio n. 18
0
 def conceptlists_dicts(self):
     """Return the parsed rows of conceptlists.tsv."""
     path = self.data_path('conceptlists.tsv')
     return read_dicts(path)