def path_as_media_object(path):
    """Returns a MediaObject dict for the file at `path`

    :param path: path to a local file
    :returns: a MediaObject dict including a `sha256Digest` and other
        relevant fields, or None if `path` is not a file
    :rtype: dict
    """
    if not os.path.isfile(path):
        return None
    return {
        '@type': 'MediaObject',
        'name': os.path.basename(path),
        'sha256Digest': hashu.sha256_file(path),
        'contentSize': readable_file_size(path),
        'dateCreated': isodate.as_utc_timestamp(os.path.getctime(path)),
        'dateModified': isodate.as_utc_timestamp(os.path.getmtime(path))
    }
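
# Usage sketch (hypothetical file path; assumes the `hashu`, `isodate` and
# `readable_file_size` helpers this module already imports). Not part of the
# original code.
def _demo_path_as_media_object():
    media = path_as_media_object('data/claims.tsv')  # hypothetical file
    if media is not None:
        print(media['name'], media['contentSize'], media['sha256Digest'][:12])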
def test_parse_iso8601_dt_01():
    dt_str = '2020-01-17T23:18:45.431Z'
    dt = isodate.parse_iso8601_dt(dt_str)
    assert isinstance(dt, datetime.datetime)
    assert '2020-01-17T23:18:45.431000Z' == isodate.as_utc_timestamp(dt)
def test_as_utc_timestamp_01():
    # 42000.42 s after the epoch is 11:40:00.42 UTC
    assert '1970-01-01T11:40:00.420000Z' == isodate.as_utc_timestamp(42000.42)
    assert '2007-12-05T13:42:42.000042Z' == isodate.as_utc_timestamp(
        datetime.datetime(2007, 12, 5, 13, 42, 42, 42))
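
# Illustrative sketch of the two `isodate` helpers exercised by the tests
# above, assuming the '%Y-%m-%dT%H:%M:%S.%fZ' wire format implied by the
# expected strings. The real `isodate` module may differ; this is not its
# actual implementation.
import datetime

_ISO8601_FMT = '%Y-%m-%dT%H:%M:%S.%fZ'  # hypothetical constant name


def _sketch_parse_iso8601_dt(dt_str):
    # e.g. '2020-01-17T23:18:45.431Z' -> datetime(2020, 1, 17, 23, 18, 45, 431000)
    return datetime.datetime.strptime(dt_str, _ISO8601_FMT)


def _sketch_as_utc_timestamp(value):
    # Assumption: numeric inputs are POSIX epoch seconds, converted in UTC
    if isinstance(value, (int, float)):
        value = datetime.datetime.fromtimestamp(value, tz=datetime.timezone.utc)
    return value.strftime(_ISO8601_FMT)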
def load_tsv_vector_space(tsv_vecs_path, sep='\t'):
    """Loads the word embeddings file and creates a vecspace dict that stores
    the vectors together with their associated information and the indices
    needed for searching the space.

    :param tsv_vecs_path: path to the stored embeddings file
    :type tsv_vecs_path: str
    :param sep: separator used in the embeddings file
    :type sep: str
    :return: dictionary containing the embedding `labels`, the numpy array of
        word `vectors`, the created `faiss_index`, the `source` path of the
        embeddings and the number of embedding dimensions `dim`
    :rtype: dict
    """
    labels = []
    vectors = []
    start = time.time()
    logger.info('Loading vectors from %s' % tsv_vecs_path)
    ndims = None
    with open(tsv_vecs_path, 'r', encoding='utf-8') as vecs_f:
        for line_idx, line in enumerate(vecs_f):
            elems = line.rstrip('\n').split(sep)
            labels.append(elems[0])
            if ndims is None:
                ndims = len(elems) - 1
            msg = 'line %d, expecting %d dims, but %d' % (
                line_idx, ndims, len(elems) - 1)
            assert ndims == len(elems) - 1, msg
            vectors.append(
                np.array(list(map(float, elems[1:])), dtype=np.float32))
    vectors = np.vstack(vectors)
    labels_set = set(labels)
    if len(labels_set) != len(labels):
        logger.warning('Repeated labels, %d vs %d' % (
            len(labels), len(labels_set)))
    assert ndims == vectors.shape[1], '%d != %d' % (ndims, vectors.shape[1])
    logger.info('Loaded %d vectors in %ds' % (
        len(labels), time.time() - start))
    nvectors = normalize(vectors)
    return {
        'labels': labels,
        'vectors': nvectors,
        'faiss_index': create_faiss_index(nvectors, ndims),
        'source': tsv_vecs_path,
        'dim': ndims,
        'dataset_info': {
            '@context': 'http://schema.org',
            '@type': 'Dataset',
            'name': 'Co-inform Sentence embeddings',
            'identifier': hashu.sha256_file(tsv_vecs_path),
            'description': 'Dataset of %d sentence embeddings extracted from '
                           'claim reviews and articles collected as part of '
                           'the Co-inform project' % len(labels),
            'dateCreated': isodate.as_utc_timestamp(
                os.path.getctime(tsv_vecs_path)),
            'dateModified': isodate.as_utc_timestamp(
                os.path.getmtime(tsv_vecs_path)),
            'creator': bot_describer.esiLab_organization(),
            'encoding': {
                '@type': 'MediaObject',
                'contentSize': bot_describer.readable_file_size(tsv_vecs_path),
                'encodingFormat': 'text/tab-separated-values'
            }
        }
    }
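
# Usage sketch (hypothetical file path). Since the vectors are L2-normalised
# before indexing, `create_faiss_index` presumably builds an inner-product
# index (e.g. faiss.IndexFlatIP), so inner product equals cosine similarity;
# the sketch only relies on the standard faiss `search(x, k)` call, which
# returns (scores, indices). Not part of the original module.
def _demo_load_tsv_vector_space():
    vecspace = load_tsv_vector_space('data/coinform_embeddings.tsv')  # hypothetical path
    # Query with the first stored vector: it should be its own nearest neighbour
    query = vecspace['vectors'][:1]  # shape (1, dim), already normalised
    scores, idxs = vecspace['faiss_index'].search(query, 5)  # top-5 neighbours
    for score, idx in zip(scores[0], idxs[0]):
        print(vecspace['labels'][idx], float(score))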