Example 1
0
    def map(self,
            clist,
            otherlist=None,
            out=None,
            full_search=False,
            similarity_level=5,
            language='en',
            skip_multiple=False):
        """
        Map the concepts of a concept-list file to Concepticon concept sets
        and write the result as rows (original columns plus CONCEPTICON_ID,
        CONCEPTICON_GLOSS and SIMILARITY).

        :param clist: Path-like object pointing to the concept list to map; \
must exist.
        :param otherlist: Optional alternative mapping data, forwarded to \
``self._get_map_for_language``.
        :param out: Destination passed to ``UnicodeWriter``; when ``None`` \
the rendered table is printed to stdout at the end.
        :param full_search: When ``True`` use ``concept_map`` (which yields \
at most one candidate per item), otherwise ``concept_map2``.
        :param similarity_level: Matcher threshold; rows whose similarity is \
<= this value are counted as "good" matches in the summary line.
        :param language: Language code used to pick the mapping data and \
forwarded to the matcher.
        :param skip_multiple: When ``True``, items with more than one \
distinct candidate are dropped instead of written as a block of rows.
        """
        assert clist.exists(), "File %s does not exist" % clist
        from_ = read_dicts(clist)

        # Mapping data: items appear to be (concept-set ID, gloss, ...) rows,
        # where the gloss is 'CONCEPTICON_GLOSS///language gloss' — hence the
        # split('///') below. TODO(review): confirm against the map-*.tsv files.
        to = self._get_map_for_language(language, otherlist)
        # cmap maps the index of an input item to (candidate indices into
        # `to`, similarity score).
        cmap = (concept_map if full_search else concept_map2)(
            [i.get('GLOSS', i.get('ENGLISH')) for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level,
            freqs=self.frequencies,
            language=language
        )
        good_matches = 0
        with UnicodeWriter(out) as writer:
            # Header row: original columns followed by the mapping columns.
            writer.writerow(
                list(from_[0].keys()) +
                ['CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
            for i, item in enumerate(from_):
                row = list(item.values())
                # Items absent from cmap default to "no matches" with a
                # similarity of 10, i.e. above the default threshold of 5.
                matches, sim = cmap.get(i, ([], 10))
                if sim <= similarity_level:
                    good_matches += 1
                if not matches:
                    # Unmatched item: flag the gloss column with '???'.
                    writer.writerow(row + ['', '???', ''])
                elif len(matches) == 1:
                    row.extend([
                        to[matches[0]][0], to[matches[0]][1].split('///')[0], sim])
                    writer.writerow(row)
                else:
                    # Multiple candidates only occur without full_search.
                    assert not full_search
                    # we need a list to retain the order by frequency
                    visited = []
                    for j in matches:
                        # NOTE(review): the names look swapped — to[j][0] is
                        # the concept-set ID and to[j][1] the gloss — but the
                        # rows below emit [gls, cid] which matches the
                        # ID/GLOSS column order of the header. Verify.
                        gls, cid = to[j][0], to[j][1].split('///')[0]
                        if (gls, cid) not in visited:
                            visited += [(gls, cid)]
                    if len(visited) > 1:
                        if not skip_multiple:
                            # One output row per distinct candidate pair.
                            writer.writeblock(
                                row + [gls, cid, sim] for gls, cid in visited)
                    else:
                        # All candidates collapsed to one distinct pair.
                        row.extend([visited[0][0], visited[0][1], sim])
                        writer.writerow(row)
            # Trailing summary row, padded to the full column count.
            writer.writerow(
                ['#',
                 '{0}/{1}'.format(good_matches, len(from_)),
                 '{0:.0f}%'.format(100 * good_matches / len(from_))] +
                (len(from_[0]) - 1) * [''])

        if out is None:
            print(writer.read().decode('utf-8'))
Example 2
0
def lookup(args):
    """
    Looks up a single gloss from the commandline.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    # One group of candidate matches comes back per queried gloss.
    groups = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None) as writer:
        writer.writerow(
            ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        # Flatten the per-gloss groups into a single stream of result rows.
        for candidate in (m for group in groups for m in group):
            writer.writerow(candidate)
        print(writer.read().decode('utf-8'))
Example 3
0
def _write_linking_data(api, l, args):
    """
    Collect gloss-to-concept-set links for language ``l`` across all concept
    lists and write them to the package data file ``data/map-<iso2>.tsv``.

    :param api: Concepticon API object providing ``conceptlists`` and \
``conceptsets``.
    :param l: Language object; ``l.iso2`` and ``l.name`` are used.
    :param args: CLI namespace; only ``args.log`` is used, for progress info.
    """
    # (gloss, concepticon_id) -> priority (occurrence count over all lists).
    out = defaultdict(int)
    # concepticon_id -> total number of linked occurrences.
    freqs = defaultdict(int)

    for clist in api.conceptlists.values():
        args.log.info("checking {clist.id}".format(clist=clist))
        for row in clist.concepts.values():
            if not row.concepticon_id:
                continue  # only linked concepts contribute to the mapping
            gls = None
            if l.iso2 == 'en':
                if row.english:
                    gls = row.english.strip('*$-—+')
            elif l.name in row.attributes and row.attributes[l.name]:
                gls = row.attributes[l.name].strip('*$-—+')

            if gls:
                # Glosses are stored as 'CONCEPTICON_GLOSS///language gloss'.
                out[row.concepticon_gloss + '///' + gls, row.concepticon_id] += 1
                freqs[row.concepticon_id] += 1

    if l.iso2 == 'en':
        # For English, add canonical variants derived from the ontological
        # category of each concept set (article, plural, infinitive marker,
        # part-of-speech disambiguator), each with the set's base priority.
        for cset in api.conceptsets.values():
            gloss = cset.gloss
            if cset.ontological_category == 'Person/Thing':
                out[gloss + '///the ' + cset.gloss.lower(), cset.id] = freqs[cset.id]
                out[gloss + '///the ' + cset.gloss.lower() + 's', cset.id] = \
                    freqs[cset.id]
            elif cset.ontological_category == 'Action/Process':
                out[gloss + '///to ' + cset.gloss.lower(), cset.id] = freqs[cset.id]
            elif cset.ontological_category == 'Property':
                out[gloss + '///' + cset.gloss.lower() + ' (adjective)', cset.id] = \
                    freqs[cset.id]
            elif cset.ontological_category == 'Classifier':
                out[gloss + '///' + cset.gloss.lower() + ' (classifier)', cset.id] = \
                    freqs[cset.id]
            else:
                out[gloss + '///' + cset.gloss.lower(), cset.id] = freqs[cset.id]

    p = Path(pyconcepticon.__file__).parent.joinpath('data', 'map-{0}.tsv'.format(l.iso2))
    with UnicodeWriter(p, delimiter='\t') as f:
        f.writerow(['ID', 'GLOSS', 'PRIORITY'])
        # Keys are unique, so sorting the items orders rows exactly like
        # sorting the keys did; this drops the previously unused enumerate
        # index and the extra dict lookup per row.
        for (gloss, cid), priority in sorted(out.items()):
            f.writerow([cid, gloss, priority])
Example 4
0
def lookup(args):
    """
    Look up the specified glosses in Concepticon.

    Examples
    --------
    $ concepticon lookup gloss1 gloss2 gloss3 ...
    """
    # Query the catalogue: one group of candidate matches per input gloss.
    hits = Concepticon().lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)

    header = ['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY']
    with UnicodeWriter(None) as writer:
        writer.writerow(header)
        for group in hits:
            for hit in group:
                writer.writerow(hit)
        print(writer.read().decode('utf-8'))