Example #1
def make_derivations(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    esrlpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    if not os.path.exists(esrlpath):
        os.makedirs(esrlpath)

    progress = 0
    svc = grpc.CcgParserService(daemon)
    stub = svc.open_client()

    failed_total = 0
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data',
                           'RAW')
    dirlist = os.listdir(ldcpath)

    try:
        for fname in dirlist:
            ldcpath1 = os.path.join(ldcpath, fname)
            with open(ldcpath1, 'r') as fd:
                lines = fd.readlines()

            m = idsrch.match(os.path.basename(ldcpath1))
            if m is None:
                continue

            derivations = []
            failed_parse = []
            for ln in lines:
                # Parse with EasySRL via gRPC
                try:
                    ccg = grpc.ccg_parse(stub, ln)
                    derivations.append(safe_utf8_encode(ccg.replace('\n', '')))
                except Exception:
                    failed_parse.append(safe_utf8_encode(ln.strip()))
                    # Insert a comment so line numbers stay aligned with ids
                    derivations.append(
                        safe_utf8_encode('# FAILED: ' + ln.strip()))
                progress = print_progress(progress, 10)
            id = m.group('id')
            if len(derivations) != 0:
                with open(os.path.join(esrlpath, 'ccg_derivation%s.txt' % id),
                          'w') as fd:
                    fd.write(b'\n'.join(derivations))

            failed_total += len(failed_parse)
            if len(failed_parse) != 0:
                with open(os.path.join(esrlpath, 'ccg_failed%s.txt' % id),
                          'w') as fd:
                    fd.write(b'\n'.join(failed_parse))
    finally:
        print_progress(progress, 10, done=True)
        svc.shutdown()

    if failed_total != 0:
        print('THERE WERE %d PARSE FAILURES' % failed_total)
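
make_derivations() leans on module-level globals (projdir, idsrch) and a project gRPC helper module. A minimal sketch of the assumed setup; the import path and the id pattern are hypothetical, and the regex must expose a named 'id' group:

import os
import re
from marbles.ie import grpc  # hypothetical import path for the helper module

projdir = os.path.dirname(os.path.dirname(__file__))
# Hypothetical pattern: capture a two-digit section id from LDC file names.
idsrch = re.compile(r'.*(?P<id>\d\d)')
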
Example #2
    def save(self, stream):
        self.modify_lock.acquire()
        try:
            stream.write(b'%d:%d\n' % (self.max_edit_distance, self.longest_word_length))
            for k, v in self.dictionary.iteritems():
                stream.write(safe_utf8_encode(k))
                stream.write(b':')
                stream.write(safe_utf8_encode(str(v[1])))
                stream.write(b':')
                stream.write(safe_utf8_encode(':'.join(v[0])))
                stream.write(b'\n')
        finally:
            self.modify_lock.release()
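
A minimal usage sketch: save() writes byte strings, so opening the target file in binary mode keeps the writes portable. Example #17 below calls it the same way on dictionary-en.dat.

# 'spellchecker' is an instance of the class that defines save() above.
with open('dictionary-en.dat', 'wb') as fp:
    spellchecker.save(fp)
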
Example #3
    def __init__(self, daemon, workdir=None, jarfile=None, extra_args=None, debug=False):
        """Create a CCG Parse Service.

        Args:
            daemon: 'easysrl' or 'neuralccg'.
            workdir: Optional path to the daemon if in release mode.
            jarfile: Optional path to the daemon jar file.
            extra_args: Optional list of extra daemon command-line arguments.
            debug: If True, run the daemon with logging attached to the console.
        """
        global _logger, _GRPC_RUNNING
        self.workdir = safe_utf8_encode(workdir) if workdir else os.getcwd()
        self.grpc_stop_onclose = False
        self.daemon_name = safe_utf8_encode(daemon)
        self.child = None
        extra_args = None if extra_args is None else [safe_utf8_encode(a) for a in extra_args]
        try:
            # Check if easyxxx service has started. If not start it.
            self.grpc_stub, _ = get_client_transport('localhost', self.daemon_port)
            ccg_parse(self.grpc_stub, '')
        except Exception:
            # Not started
            _logger.info('Starting %s gRPC daemon', self.daemon_name)

            if USE_DEVEL_PATH and jarfile is None:
                cmdline = [os.path.join(PROJDIR, 'scripts', 'start_server.sh'), daemon]
                if extra_args is not None:
                    cmdline.extend(extra_args)
                subprocess.call(cmdline)
                time.sleep(self._WAIT_TIME)   # Give it some time to lock session access
            elif jarfile is not None:
                log_file = os.path.join(workdir, self.daemon_name + '.log')
                if debug:
                    cmdline = ['/usr/bin/java', '-Dlog4j.debug', '-jar', jarfile, '--daemonize']
                else:
                    cmdline = ['/usr/bin/java', '-jar', jarfile, '--daemonize']
                if extra_args is not None:
                    cmdline.extend(extra_args)
                _logger.debug(cmdline)
                if debug:
                    self.child = subprocess.Popen(cmdline)
                else:
                    self.child = subprocess.Popen(cmdline, stdout=open('/dev/null', 'w'), stderr=open('/dev/null', 'w'))
                time.sleep(self._WAIT_TIME)
                os.kill(self.child.pid, 0)  # signal 0: raises OSError if the child died
                self.grpc_stop_onclose = True
                _logger.info('started child daemon with pid %d', self.child.pid)
            else:
                raise ValueError('CcgParserService.__init__(): jarfile is required outside the development tree')
            _GRPC_RUNNING.add(self)
            self.grpc_stub, _ = get_client_transport('localhost', self.daemon_port)
            # Call asynchronously - will wait until the default session is created
            ccg_parse(self.grpc_stub, '', timeout=120)
            self.grpc_stop_onclose = True
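
Typical usage, following Example #1: construct the service for a daemon, obtain a client stub, and shut the service down when finished.

svc = grpc.CcgParserService('easysrl')
stub = svc.open_client()
try:
    ccg = grpc.ccg_parse(stub, 'The cat sat on the mat.')
finally:
    svc.shutdown()
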
Example #4
def ccg_parse(client, sentence, session_id=DEFAULT_SESSION, timeout=0):
    """Parse the sentence using the specified session.

    Args:
        client: The client end-point stub returned from get_client_transport()
        sentence: The sentence. Can be unicode, utf-8, or ascii.
        session_id: Optional session id.
        timeout: If non-zero make the call asynchronously with timeout equal to this value.
            Typically not needed unless the call may timeout when run synchronously.

    Returns:
        The response message string.
    """
    isUnicode = isinstance(sentence, unicode)
    if isUnicode:
        # CCG Parser is Java so input must be utf-8 or ascii
        sentence = sentence.encode('utf-8')
    query_input = create_query_input('text', sentence)
    request = Request()
    request.LUCID = session_id
    request.spec.name = 'infer'
    request.spec.content.extend([query_input])
    if timeout <= 0:
        response = client.infer(request)
    else:
        infer_future = client.infer.future(request, timeout)
        # FIXME: Need to add error reporting to Response structure.
        response = infer_future.result()
    if future_string == unicode:
        # If the compatibility string type is unicode, always return unicode.
        isUnicode = True
    if isinstance(response.msg, unicode):
        return response.msg if isUnicode else safe_utf8_encode(response.msg)
    return response.msg if not isUnicode else safe_utf8_decode(response.msg)
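
A short usage sketch, assuming a parser daemon is already listening; the port number here is hypothetical (the real value comes from CcgParserService.daemon_port).

stub, _ = get_client_transport('localhost', 50051)
derivation = ccg_parse(stub, u'John loves Mary.')
# First calls can be slow while a session is created, so allow a timeout.
derivation = ccg_parse(stub, u'John loves Mary.', timeout=120)
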
Example #5
def build_from_model(fn_dict, outdir, modelPath, verbose=False, verify=True):
    print('Building function templates from model folder...')
    fname = os.path.join(modelPath, 'markedup')
    if not os.path.exists(fname) or not os.path.isfile(fname):
        print('Error: %s does not exist or is not a file' % fname)
        return fn_dict

    with open(fname, 'r') as fd:
        signatures = fd.readlines()

    failed_rules = []
    progress = 0
    for sig in signatures:
        predarg = Category(sig.strip())
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue

            if verify:
                f = template.create_empty_functor()
                U1 = f.get_unify_scopes(False)
                U2 = f.category.extract_unify_atoms(False)
                assert len(U1) == len(U2)
                C1 = f.category
                C2 = template.predarg_category.clean(True)
                assert C1.can_unify(C2)

            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = str(f1)
                t2 = str(template)
                assert t1 == t2, 'verify failed\n  t1=%s\n  t2=%s\n  f1=%s\n  f2=%s' % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
            # DEBUG ?
            if False:
                try:
                    FunctorTemplate.create_from_category(predarg)
                except Exception:
                    pass

    print_progress(progress, done=True)

    if len(failed_rules) != 0:
        print('Warning: model - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_easysrl_templates_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)

    return fn_dict
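
A minimal driver sketch, assuming a model directory that contains the EasySRL 'markedup' signatures file; both paths are hypothetical.

fn_dict = {}
fn_dict = build_from_model(fn_dict, outdir='out', modelPath='model', verbose=True)
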
Example #6
    def make_s3_name(cls, text):
        global _NALNUMSP
        text = text.lower()
        if future_string == unicode:
            return '-'.join(filter(lambda y: len(y) != 0, _NALNUMSP.sub('', text).split(' ')))

        if isinstance(text, unicode):
            text = text.encode('utf-8')
        result = '-'.join(filter(lambda y: len(y) != 0, _NALNUMSP.sub('', text).split(' ')))
        # PWG: unclear why, but if text contains unicode the result
        # is promoted to unicode automatically
        return safe_utf8_encode(result)
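
An illustrative walk-through, assuming _NALNUMSP matches any character that is neither alphanumeric nor a space:

# 'Breaking News: AI & CCG!'   (input)
# 'breaking news: ai & ccg!'   (lower())
# 'breaking news ai  ccg'      (_NALNUMSP.sub('', text))
# 'breaking-news-ai-ccg'       (split on ' ', drop empties, join with '-')
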
Example #7
def strip_apostrophe_s(word):
    """Strip trailing 's from nouns.

    Args:
        word: An ascii or utf-8 string.

    Returns:
        The stripped word.
    """
    # Must support utf-8
    if len(word) > 2:
        if word.endswith("'s"):
            return word[0:-2]
        elif isinstance(word, unicode):
            if word.endswith(u"’s"):
                return word[0:-2]
        else:
            uword = safe_utf8_decode(word)
            if uword.endswith(u"’s"):
                return safe_utf8_encode(uword[0:-2])
    return word
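
A few illustrative calls (Python 2 semantics, where str is a byte string):

assert strip_apostrophe_s("dog's") == 'dog'
assert strip_apostrophe_s(u'dog\u2019s') == u'dog'  # curly apostrophe
assert strip_apostrophe_s('it') == 'it'             # too short, returned unchanged
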
Example #8
def build_from_ldc_ccgbank(fn_dict, outdir, verbose=False, verify=True):
    print('Building function templates from LDC ccgbank...')

    allfiles = []
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data', 'AUTO')
    dirlist1 = os.listdir(ldcpath)
    for dir1 in dirlist1:
        ldcpath1 = os.path.join(ldcpath, dir1)
        if os.path.isdir(ldcpath1):
            dirlist2 = os.listdir(ldcpath1)
            for dir2 in dirlist2:
                ldcpath2 = os.path.join(ldcpath1, dir2)
                if os.path.isfile(ldcpath2):
                    allfiles.append(ldcpath2)

    failed_parse = []
    failed_rules = []
    rules = []
    progress = 0
    for fn in allfiles:
        progress = print_progress(progress, 10)
        with open(fn, 'r') as fd:
            lines = fd.readlines()
        for hdr, ccgbank in zip(lines[0::2], lines[1::2]):
            pt = None
            try:
                pt = parse_ccg_derivation(ccgbank)
                extract_predarg_categories_from_pt(pt, rules)
            except Exception as e:
                failed_parse.append(safe_utf8_encode('CCGBANK: ' + ccgbank.strip()))
                failed_parse.append(safe_utf8_encode('Error: %s' % e))
            # Now attempt to track undefined unary rules
            if pt is not None:
                try:
                    builder = Ccg2Drs()
                    builder.build_execution_sequence(pt)
                    # Calling this will track undefined
                    builder.get_predarg_ccgbank()
                except Exception:
                    # Ignore failures; only the unary-rule tracking side effect matters here
                    pass

    progress = (progress / 10) * 1000   # rescale to the new progress tick interval
    for predarg in rules:
        progress = print_progress(progress, 1000)
        try:
            catkey = predarg.clean(True)
            template = FunctorTemplate.create_from_category(predarg)
            if template is None:
                continue
            if catkey.signature not in fn_dict:
                fn_dict[catkey.signature] = template
            elif verify:
                f1 = fn_dict[catkey.signature]
                t1 = future_string(f1)
                t2 = future_string(template)
                assert t1 == t2, 'verify failed\n  t1=%s\n  t2=%s\n  f1=%s\n  f2=%s' % (t1, t2, f1.predarg_category, predarg)
        except Exception as e:
            failed_rules.append(safe_utf8_encode('%s: %s' % (predarg, e)))
            # DEBUG ?
            if False:
                try:
                    FunctorTemplate.create_from_category(predarg)
                except Exception:
                    pass

    print_progress(progress, done=True)

    if len(failed_parse) != 0:
        print('Warning: ldc - %d parses failed' % (len(failed_parse)/2))
        with open(os.path.join(outdir, 'parse_ccg_derivation_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_parse))
        if verbose:
            for m in failed_parse:
                print(m)

    if len(failed_rules) != 0:
        print('Warning: ldc - %d rules failed' % len(failed_rules))
        with open(os.path.join(outdir, 'functor_ldc_templates_failed.dat'), 'w') as fd:
            fd.write(b'\n'.join(failed_rules))
        if verbose:
            for m in failed_rules:
                print(m)

    return fn_dict
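
Both builders take and return the same template dictionary, so they chain naturally; a minimal sketch with hypothetical paths:

fn_dict = {}
fn_dict = build_from_ldc_ccgbank(fn_dict, outdir='out')
fn_dict = build_from_model(fn_dict, outdir='out', modelPath='model')
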
Example #9
    def __str__(self):
        return safe_utf8_encode(self._get_str())
Example #10
        if orphaned:
            sys.stdout.write('<orphaned>\n')
            sys.stdout.write(orphaned)
            sys.stdout.write('\n</orphaned>\n')
        if conjoins:
            sys.stdout.write('<conjoins>\n')
            sys.stdout.write(conjoins)
            sys.stdout.write('\n</conjoins>\n')
        if functor_phrases:
            sys.stdout.write('<functor_phrases>\n')
            sys.stdout.write(functor_phrases)
            sys.stdout.write('\n</functor_phrases>\n')
    else:
        with open(outfile, 'w') as fd:
            if html:
                fd.write(safe_utf8_encode(html))
                fd.write(b'\n')
            if ccg:
                fd.write(b'<ccg>\n')
                fd.write(safe_utf8_encode(ccg.strip()))
                fd.write(b'\n</ccg>\n')
            if pccg:
                fd.write(b'<predarg>\n')
                fd.write(safe_utf8_encode(pccg))
                fd.write(b'\n</predarg>\n')
            if drs:
                fd.write(b'<drs>\n')
                fd.write(safe_utf8_encode(drs))
                fd.write(b'\n</drs>\n')
            if fol:
                fd.write(b'<fol>\n')
Example #11
    ldcpath = os.path.join(projdir, 'data', 'ldc', 'ccgbank_1_1', 'data',
                           'AUTO')
    outpath = os.path.join(projdir, 'data', 'ldc', 'mapping')
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    dirlist1 = os.listdir(ldcpath)
    for dir1 in dirlist1:
        ldcpath1 = os.path.join(ldcpath, dir1)
        if os.path.isdir(ldcpath1):
            dirlist2 = os.listdir(ldcpath1)
            mapping = []
            for dir2 in dirlist2:
                ldcpath2 = os.path.join(ldcpath1, dir2)
                wsjnm, _ = os.path.splitext(dir2)
                if os.path.isfile(ldcpath2):
                    id = 1
                    missing = False
                    with open(ldcpath2, 'r') as fd:
                        lines = fd.readlines()
                        for hdr, ccgbank in zip(lines[0::2], lines[1::2]):
                            hdrid = hdr.split(' ')[0][3:].strip()
                            expected_hdrid = '%s.%d' % (wsjnm, id)
                            if not missing and hdrid != expected_hdrid:
                                missing = True
                                print('missing entry, expected %s, actual %s' %
                                      (expected_hdrid, hdrid))
                            mapping.append(safe_utf8_encode(hdrid))
                            id += 1
            with open(os.path.join(outpath, 'ccg_map%s.txt' % dir1),
                      'w') as fd:
                fd.write(b'\n'.join(mapping))
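
Each line of the resulting ccg_mapXX.txt is a derivation id of the form <file>.<line>, as built by the '%s.%d' format above; assuming the usual LDC file names (wsj_0001.auto, ...), the content looks like:

# wsj_0001.1
# wsj_0001.2
# wsj_0002.1
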
Example #12
def make_lexicon(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))

    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'lexicon')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)
    if not os.path.exists(os.path.join(easysrl_path, 'rt')):
        os.makedirs(os.path.join(easysrl_path, 'rt'))
    if not os.path.exists(os.path.join(easysrl_path, 'az')):
        os.makedirs(os.path.join(easysrl_path, 'az'))

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = sorted(os.listdir(ldcpath))
    #dirlist1 = ['ccg_derivation00.txt']
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg_derivation = []
    start = 0
    progress = -1
    dictionary = None
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')

        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue

            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)

            try:
                # CCG parser is Java so output is UTF-8.
                ccgbank = safe_utf8_decode(ccgbank)
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
            except Exception:
                failed_parse += 1
                continue

            uid = '%s-%04d' % (idx, i)
            try:
                # dictionary[0-25][stem] = {usage: set(uid)}
                dictionary = extract_lexicon_from_pt(pt, dictionary, uid=uid)
            except Exception as e:
                print(e)
                continue

    rtdict = {}
    for idx in range(len(dictionary)):
        fname = unichr(idx + 0x40)
        filepath = os.path.join(easysrl_path, 'az', fname + '.txt')
        with open(filepath, 'w') as fd:
            d = dictionary[idx]
            for k, v in d.iteritems():
                # k == stem, v = {c: set(uid)}
                fd.write(b'<predicate name=\'%s\'>\n' % safe_utf8_encode(k))
                for x, w in v.iteritems():
                    fd.write(b'<usage \'%s\'>\n' % safe_utf8_encode(x))
                    nc = x.split(':')
                    if len(nc) == 2:
                        c = Category.from_cache(
                            Category(nc[1].strip()).clean(True))
                        # Return type atom
                        rt = c.extract_unify_atoms(False)[-1]
                        if rt in rtdict:
                            cdict = rtdict[rt]
                            if c in cdict:
                                cdict[c].append(nc[0])
                            else:
                                cdict[c] = [nc[0]]
                        else:
                            rtdict[rt] = {c: [nc[0]]}
                    for y in w:
                        fd.write(b'sentence id: ' + safe_utf8_encode(y))
                        fd.write(b'\n')
                    fd.write(b'</usage>\n')
                fd.write(b'</predicate>\n\n')
            # Free up memory
            dictionary[idx] = None
            d = None
    for rt, cdict in rtdict.iteritems():
        fname = rt.signature.replace('[', '_').replace(']', '')
        filepath = os.path.join(easysrl_path, 'rt', fname + '.txt')
        with open(filepath, 'w') as fd:
            for c, vs in cdict.iteritems():
                fd.write(b'<category signature=\'%s\'>\n' %
                         safe_utf8_encode(c))
                for v in vs:
                    fd.write(v)
                    fd.write(b'\n')
                fd.write(b'</category>\n\n')
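
For orientation, a sketch of the dictionary shape this loop assumes, inferred from the comments above (the example values are hypothetical):

# dictionary[idx]        -> entries whose stem starts with unichr(idx + 0x40)
# dictionary[idx][stem]  -> {usage: set(sentence uids)}, e.g.
#   dictionary[1]['bark'] == {'bark:S\\NP': set(['00-0001'])}
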
Example #13
    def __repr__(self):
        if self.drs:
            return b'<Lexeme>:(%s, %s, %s)' % (safe_utf8_encode(self.word),
                                               self.drs, self.category)
        return b'<Lexeme>:(%s, %s, %s)' % (safe_utf8_encode(self.word),
                                           self.stem, self.category)
Example #14
    def __str__(self):
        return safe_utf8_encode(self.to_string())
Example #15
    def get_aws_s3_names(self, article_text):
        """Get the s3 name for the article.

        Returns:
            A tuple containing the s3 bucket and object-name for the article.
        """
        # FIXME: move to __future__
        global _DOM
        m = _DOM.match(self.entry.link)
        assert m is not None
        if future_string == unicode:
            dom = m.group('domain').replace('.', '-')
            name = self.make_s3_name(self.entry.title)
            dt = self.get_date()
            dtYM = '{:%Y-%m}'.format(dt)
            dtD  = '{:%d}'.format(dt)[::-1]
            h = hashlib.md5()
            language = self.feed.language.lower() if hasattr(self.feed, 'language') else 'en-us'
            h.update(safe_utf8_encode(language))
            h.update(safe_utf8_encode(dom))
            h.update(safe_utf8_encode(name))
            h.update(safe_utf8_encode(article_text))
            h = h.hexdigest()
            feedtitle = self.make_s3_name(self.feed.title) if hasattr(self.feed, 'title') else 'unknown'
            return 'marbles-ai-feeds-%s-%s' % (language, dtYM), '%s/%s/%s/%s/%s' % (dtD, dom, feedtitle, name, h)

        dom = safe_utf8_encode(m.group('domain').replace('.', '-'))
        name = self.make_s3_name(self.entry.title)
        dt = self.get_date()
        dtYM = safe_utf8_encode('{:%Y-%m}'.format(dt))
        dtD  = safe_utf8_encode('{:%d}'.format(dt)[::-1])
        h = hashlib.md5()
        article_text = safe_utf8_encode(article_text)
        name = safe_utf8_encode(name)
        # FIXME: use geo-location on domain to infer language
        language = safe_utf8_encode(self.feed.language.lower()) if hasattr(self.feed, 'language') else 'en-us'
        h.update(language)
        h.update(dom)
        h.update(name)
        h.update(article_text)
        h = h.hexdigest()
        feedtitle = self.make_s3_name(self.feed.title) if hasattr(self.feed, 'title') else 'unknown'
        return 'marbles-ai-feeds-%s-%s' % (language, dtYM), '%s/%s/%s/%s/%s' % (dtD, dom, feedtitle, name, h)
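
A usage sketch; 'article' stands for an instance of the enclosing feed-article class, and the values shown are hypothetical:

bucket, objname = article.get_aws_s3_names(article_text)
# bucket:  'marbles-ai-feeds-en-us-2017-06'
# objname: '51/www-example-com/feed-title/article-title/<md5 hexdigest>'
#          (the leading '51' is the day of month '15' reversed)
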
Example #16
def make_drs(daemon):
    global pypath, projdir, datapath, idsrch
    allfiles = []
    projdir = os.path.dirname(os.path.dirname(__file__))

    easysrl_path = os.path.join(projdir, 'data', 'ldc', daemon, 'drs')
    if not os.path.exists(easysrl_path):
        os.makedirs(easysrl_path)

    # Get files
    ldcpath = os.path.join(projdir, 'data', 'ldc', daemon, 'ccgbank')
    dirlist1 = os.listdir(ldcpath)
    for fname in dirlist1:
        if 'ccg_derivation' not in fname:
            continue
        ldcpath1 = os.path.join(ldcpath, fname)
        if os.path.isfile(ldcpath1):
            allfiles.append(ldcpath1)

    failed_parse = 0
    failed_ccg2drs = []
    start = 0
    progress = -1
    for fn in allfiles:
        idx = idsrch.match(fn)
        if idx is None:
            continue
        idx = idx.group('id')

        if not os.path.exists(os.path.join(easysrl_path, idx)):
            os.mkdir(os.path.join(easysrl_path, idx))

        with open(fn, 'r') as fd:
            lines = fd.readlines()

        name, _ = os.path.splitext(os.path.basename(fn))
        for i in range(start, len(lines)):
            start = 0
            ccgbank = lines[i].strip()
            if len(ccgbank) == 0 or ccgbank[0] == '#':
                continue

            if progress < 0:
                print('%s-%04d' % (name, i))
            else:
                progress = print_progress(progress, 10)

            try:
                # CCG parser is Java so output is UTF-8.
                pt = parse_ccg_derivation(ccgbank)
                s = sentence_from_pt(pt).strip()
                pccg = pt_to_ccg_derivation(pt)
            except Exception:
                failed_parse += 1
                continue

            try:
                d = process_ccg_pt(
                    pt, CO_VERIFY_SIGNATURES | CO_NO_VERBNET
                    | CO_NO_WIKI_SEARCH).get_drs()
                assert d is not None
                assert isinstance(d, DRS)
                d = d.show(SHOW_LINEAR).strip()
            except Exception as e:
                print(e)
                failed_ccg2drs.append((name, i, ccgbank))
                continue

            with open(
                    os.path.join(easysrl_path, idx,
                                 'drs_%s_%04d.dat' % (idx, i)), 'w') as fd:
                fd.write(b'<sentence>\n')
                fd.write(safe_utf8_encode(s))
                fd.write(b'\n</sentence>\n<drs>\n')
                fd.write(safe_utf8_encode(d))
                fd.write(b'\n</drs>\n<predarg>\n')
                fd.write(safe_utf8_encode(pccg))
                fd.write(b'\n')
                fd.write(b'</predarg>\n')

    if failed_parse != 0:
        print('%d derivations failed to parse' % failed_parse)
    if len(failed_ccg2drs) != 0:
        print('%d derivations failed to convert to DRS' % len(failed_ccg2drs))
        for x in failed_ccg2drs:
            print('%s-%04d failed: {%s}' % x)
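
These scripts appear to form a pipeline over the same data/ldc/<daemon> layout: make_derivations() (Example #1) writes the ccg_derivationNN.txt files that make_drs() and make_lexicon() (Example #12) consume. A minimal driver sketch:

make_derivations('easysrl')  # parse LDC RAW text into ccg_derivationNN.txt
make_drs('easysrl')          # convert each derivation to a DRS file
make_lexicon('easysrl')      # build the lexicon from the same derivations
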
Example #17
    # Use CCGBANK as our corpus
    for fname in dirlist:
        print(fname)
        ldcpath1 = os.path.join(ldcpath, fname)

        m = idsrch.match(os.path.basename(ldcpath1))
        if m is None:
            continue

        with open(ldcpath1, 'r') as fp:
            stats = spellchecker.build_from_corpus(fp, stats)

    # Iterate wordnet
    strm = StringIO.StringIO()
    for ss in wn.all_synsets():
        ln = ' '.join(ss.lemma_names())
        strm.write(safe_utf8_encode(ln))
        strm.write(b'\n')
    strm.seek(0)
    stats = spellchecker.build_from_corpus(strm, stats)

    print("total words processed: %i" % stats[0])
    print("total unique words in corpus: %i" % stats[1])
    print("total items in dictionary (corpus words and deletions): %i" % len(spellchecker.dictionary))
    print("  edit distance for deletions: %i" % spellchecker.max_edit_distance)
    print("  length of longest word in corpus: %i" % spellchecker.longest_word_length)
    with open(os.path.join(pypath, 'marbles', 'ie', 'kb', 'data', 'dictionary-en.dat'), 'w') as fp:
        spellchecker.save(fp)
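
Based on save() in Example #2, dictionary-en.dat starts with a max_edit_distance:longest_word_length header followed by one entry per word; assuming v[0] holds derived deletion strings and v[1] a count, the format looks like:

# 2:15
# hello:42:ello:hllo:helo:hell
# world:17:orld:wrld:wold:worl
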