Example #1
0
def _qrun(queries, mmjar):
    """Run MetaMap over a dict of queries and merge the per-query results.

    Each query is processed in its own MetaMap invocation so that every
    query gets an independent cache entry (more efficient caching).
    Exits with an error if two queries produce the same result key.
    """
    merged = {}

    for qid, qtxt in queries.iteritems():
        # Write this single query to a temp file in "id:text" form.
        tmp = tempfile.NamedTemporaryFile()
        qpath = tmp.name

        qfile = codecs.open(qpath, "w", encoding="utf-8")
        print >> qfile, "%s:%s" % (qid, qtxt)
        qfile.close()

        # Cache key combines the query-file hash with the MetaMap jar hash.
        result = _run(qpath, mmjar,
                      cache_file=cache_file(CACHEDIR,
                                            [hash_file(qpath),
                                             hash_file(mmjar)],
                                            "metamap"))
        result = longest_concepts(result)

        for rid, rval in result.iteritems():
            if rid in merged:
                print >> sys.stderr, "error: duplicate key:", rid
                print >> sys.stderr, merged
                sys.exit(1)
            merged[rid] = rval

        tmp.close()

    return merged
Example #2
0
def _qrun(queries, mmjar):
    """Annotate every query with MetaMap, one query per invocation.

    Running queries one at a time lets each result be cached
    independently of the others.  Duplicate result keys are fatal.
    """
    results = {}

    for query_id in queries:
        single = {query_id: queries[query_id]}

        # Dump the one-entry query dict to a temp file as "id:text" lines.
        tmp = tempfile.NamedTemporaryFile()
        path = tmp.name
        handle = codecs.open(path, "w", encoding="utf-8")
        for qid, qtxt in single.iteritems():
            handle.write(u"%s:%s\n" % (qid, qtxt))
        handle.close()

        # The cache entry is keyed on both the query file and the jar.
        key = cache_file(CACHEDIR,
                         [hash_file(path), hash_file(mmjar)],
                         "metamap")
        annotated = longest_concepts(_run(path, mmjar, cache_file=key))

        for cid, concepts in annotated.iteritems():
            if cid in results:
                sys.stderr.write("error: duplicate key: %s\n" % (cid,))
                sys.stderr.write("%s\n" % (results,))
                sys.exit(1)
            else:
                results[cid] = concepts

        tmp.close()

    return results
Example #3
0
File: indexer.py Project: fellu/box
    def index(self):
        """Index the files in the box directory and sync the blob registry.

        For every regular file in ``self.box_path`` a blob named after the
        file's checksum is written into ``self.registry`` (unless it already
        exists).  Registry blobs whose source file has disappeared are
        deleted afterwards.
        """
        current_blobs = []

        for infile in os.listdir(self.box_path):
            # os.listdir returns bare names, so the directory check must be
            # done on the joined path; checking the bare name would test
            # against the current working directory instead.
            full_path = os.path.join(self.box_path, infile)
            if os.path.isdir(full_path) or infile == '.meta':
                continue

            checksum = hash_file(full_path)
            timestamp = get_file_timetamp(full_path)
            size = get_file_size(full_path)

            # Just to ease debugging.
            data = '%s:%s:%s:%s\n' % (checksum, infile, timestamp, size)

            blobpath = os.path.join(self.registry, checksum)

            # Append to current blobs
            current_blobs.append({
                'checksum': checksum,
                'filename': blobpath
            })

            # Write the blob if we don't already have it.
            if not os.path.isfile(blobpath):
                try:
                    # Todo, write much more information to the meta-blob file.
                    # Last modified, which server modified, ..
                    print("Writing blob %s (%s)" % (checksum, infile,))
                    # 'with' guarantees the blob file is closed even if the
                    # write raises.
                    with open(blobpath, 'w') as blobfile:
                        blobfile.write(data)
                except IOError:
                    raise Exception("Writing a blob failed")
            else:
                print("Blob %s exists" % checksum)

        # Clear orphaned blobs: registry entries whose checksum no longer
        # matches any current file.  (The old code also opened and read each
        # blob without closing it, and never used the content -- dropped.)
        known = set(blob['checksum'] for blob in current_blobs)
        for checksum in os.listdir(self.registry):
            if checksum not in known:
                print("Found obsolete %s blob" % checksum)
                os.unlink(os.path.join(self.registry, checksum))
Example #4
0
def container_fileinfos(path):
    """Collect per-entry metadata (size and content hash) for a zip container.

    Returns an ``odict`` mapping each archive member name to a dict holding
    its 'size' and, for non-empty members, the file hash stored under
    ``FileInfo.hash_key``.
    """
    files = odict()
    with ZipFile(str(path), 'r') as zf:
        for info in zf.filelist:
            entry = odict()
            entry['size'] = info.file_size
            # Entries with size 0 (directories, and empty files) are not
            # hashed.
            if info.file_size > 0:
                with zf.open(info.filename, 'r') as member:
                    entry[FileInfo.hash_key] = hash_file(member)
            files[info.filename] = entry
    return files
Example #5
0
def _txtrun(text, mmjar, no_cache, long_concepts):

    if not text:
        print '[metamap info] empty query'
        return {'txt': {'concepts': []}}

#     tmp = tempfile.NamedTemporaryFile()
#     tfn = tmp.name
# 
#     with codecs.open(tfn, 'wb', encoding='utf-8') as f:
#         print >> f, u"txt:{0}".format(text)
    cf = cache_file(CACHEDIR, [hash_obj(text), hash_file(mmjar)], "metamap")

    # for some inesplicable reason, the Java MetaMap API client
    # throws a java.lang.StringIndexOutOfBoundsException exception
    # instead of gracefully returning nothing when a text has no concepts.
    try:
        d = _run(text, mmjar,  no_cache=no_cache, cache_file=cf)
    except ValueError:
        print '[metamap info] no concepts found'
        d = {'txt': {'concepts': []}}

    return d
Example #6
0
def _txtrun(text, mmjar, no_cache, long_concepts):

    if not text:
        print '[metamap info] empty query'
        return {'txt': {'concepts': []}}

#     tmp = tempfile.NamedTemporaryFile()
#     tfn = tmp.name
#
#     with codecs.open(tfn, 'wb', encoding='utf-8') as f:
#         print >> f, u"txt:{0}".format(text)
    cf = cache_file(CACHEDIR, [hash_obj(text), hash_file(mmjar)], "metamap")

    # for some inesplicable reason, the Java MetaMap API client
    # throws a java.lang.StringIndexOutOfBoundsException exception
    # instead of gracefully returning nothing when a text has no concepts.
    try:
        d = _run(text, mmjar, no_cache=no_cache, cache_file=cf)
    except ValueError:
        print '[metamap info] no concepts found'
        d = {'txt': {'concepts': []}}

    return d