def _qrun(queries, mmjar):
    out = {}
    # run one query at a time for more efficient caching
    for k, v in queries.iteritems():
        q = {k: v}
        tmp = tempfile.NamedTemporaryFile()
        qfn = tmp.name
        f = codecs.open(qfn, "w", encoding="utf-8")
        for qid, qtxt in q.iteritems():
            print >> f, "%s:%s" % (qid, qtxt)
        f.close()
        d = _run(qfn, mmjar,
                 cache_file=cache_file(CACHEDIR,
                                       [hash_file(qfn), hash_file(mmjar)],
                                       "metamap"))
        d = longest_concepts(d)
        for kk, vv in d.iteritems():
            if kk in out:
                print >> sys.stderr, "error: duplicate key:", kk
                print >> sys.stderr, out
                sys.exit(1)
            else:
                out[kk] = vv
        tmp.close()
    return out
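# The cache_file helper called above is not defined in this snippet. A minimal
# sketch, assuming it only derives a deterministic path under the cache
# directory from the content hashes and a tag (the naming scheme here is an
# assumption, not the module's actual one):
import os

def _cache_file_sketch(cache_dir, hash_keys, tag):
    # e.g. <cache_dir>/<hash(qfn)>-<hash(mmjar)>.metamap
    return os.path.join(cache_dir, "%s.%s" % ("-".join(hash_keys), tag))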
def index(self):
    current_blobs = []
    for infile in os.listdir(self.box_path):
        full_path = os.path.join(self.box_path, infile)
        # Skip directories and the metadata entry.
        if os.path.isdir(full_path) or infile == '.meta':
            continue
        checksum = hash_file(full_path)
        timestamp = get_file_timetamp(full_path)
        size = get_file_size(full_path)
        # Human-readable blob contents, just to ease debugging.
        data = '%s:%s:%s:%s\n' % (checksum, infile, timestamp, size)
        blobpath = os.path.join(self.registry, checksum)
        # Remember every blob seen in this pass.
        current_blobs.append({
            'checksum': checksum,
            'filename': blobpath
        })
        # Write the blob only if we don't already have it.
        if not os.path.isfile(blobpath):
            try:
                # TODO: write much more information to the meta-blob file:
                # last modified, which server modified it, ...
                print("Writing blob %s (%s)" % (checksum, infile))
                blobfile = open(blobpath, 'w')
                blobfile.write(data)
                blobfile.close()
            except IOError:
                raise Exception("Writing a blob failed")
        else:
            print("Blob %s exists" % checksum)
    # Clear blobs orphaned by files that no longer exist.
    for checksum in os.listdir(self.registry):
        blob_path = os.path.join(self.registry, checksum)
        found = False
        for current in current_blobs:
            if current['checksum'] == checksum:
                found = True
                break
        if not found:
            print("Found obsolete blob %s" % checksum)
            os.unlink(blob_path)
def container_fileinfos(path):
    files = odict()
    with ZipFile(str(path), 'r') as zf:
        for subpath in zf.filelist:
            r = odict()
            r['size'] = subpath.file_size
            # Directories have zero size and are excluded from hashing.
            if subpath.file_size > 0:
                with zf.open(subpath.filename, 'r') as f:
                    r[FileInfo.hash_key] = hash_file(f)
            files[subpath.filename] = r
    return files
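# Hypothetical usage of container_fileinfos; "bundle.zip" is a placeholder
# archive name, and FileInfo.hash_key is the hash field used above.
infos = container_fileinfos("bundle.zip")
for name, info in infos.items():
    # Directory entries carry only a zero size and no hash.
    print("%s: %d bytes, hash %s" % (name, info['size'],
                                     info.get(FileInfo.hash_key, '-')))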
def _txtrun(text, mmjar, no_cache, long_concepts):
    if not text:
        print '[metamap info] empty query'
        return {'txt': {'concepts': []}}

    cf = cache_file(CACHEDIR, [hash_obj(text), hash_file(mmjar)], "metamap")

    # For some inexplicable reason, the Java MetaMap API client throws a
    # java.lang.StringIndexOutOfBoundsException instead of gracefully
    # returning nothing when a text has no concepts.
    try:
        d = _run(text, mmjar, no_cache=no_cache, cache_file=cf)
    except ValueError:
        print '[metamap info] no concepts found'
        d = {'txt': {'concepts': []}}
    return d
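# Every snippet above depends on a shared hash_file helper that is not shown.
# A minimal sketch, assuming SHA-1 with chunked reads; the algorithm choice is
# an assumption. Note it must accept both a filesystem path (as in _qrun and
# index) and an already-open binary file object (as in container_fileinfos).
import hashlib

def hash_file(f, chunk_size=1 << 16):
    h = hashlib.sha1()
    opened_here = False
    if not hasattr(f, 'read'):
        # Got a path rather than a file object: open it ourselves.
        f = open(f, 'rb')
        opened_here = True
    try:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    finally:
        if opened_here:
            f.close()
    return h.hexdigest()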