Esempio n. 1
0
def path_as_media_object(path):
    """Describe the local file at `path` as a MediaObject dict.

    :param path: path to a local file
    :returns: a dict with `@type`, `name`, `sha256Digest`, `contentSize`,
      `dateCreated` and `dateModified` entries, or `None` when `path` does
      not point to an existing regular file
    :rtype: dict or None
    """
    if os.path.isfile(path):
        media = {'@type': 'MediaObject'}
        media['name'] = os.path.basename(path)
        media['sha256Digest'] = hashu.sha256_file(path)
        media['contentSize'] = readable_file_size(path)
        # Timestamps come straight from the filesystem metadata.
        media['dateCreated'] = isodate.as_utc_timestamp(
            os.path.getctime(path))
        media['dateModified'] = isodate.as_utc_timestamp(
            os.path.getmtime(path))
        return media
    return None
Esempio n. 2
0
def test_parse_iso8601_dt_01():
    """Parse an ISO 8601 string and format the result back as a timestamp.

    The parsed value must be exactly a `datetime.datetime`, and
    `isodate.as_utc_timestamp` must render it with microsecond precision.
    """
    parsed = isodate.parse_iso8601_dt('2020-01-17T23:18:45.431Z')
    print('type', type(parsed))
    assert type(parsed) is datetime.datetime
    expected = '2020-01-17T23:18:45.431000Z'
    assert expected == isodate.as_utc_timestamp(parsed)
Esempio n. 3
0
def test_start_of_week_utc_timestamp_01():
    """Check `isodate.as_utc_timestamp` on a float and on a datetime input.

    NOTE(review): despite its name, this test only exercises
    `as_utc_timestamp`; the float is presumably seconds since the epoch.
    """
    from_float = isodate.as_utc_timestamp(42000.42)
    assert '1970-01-01T12:40:00.420000Z' == from_float

    moment = datetime.datetime(2007, 12, 5, 13, 42, 42, 42)
    assert '2007-12-05T13:42:42.000042Z' == isodate.as_utc_timestamp(moment)
Esempio n. 4
0
def load_tsv_vector_space(tsv_vecs_path, sep='\t'):
    """Load a word embeddings file and create a vecspace dict
    that stores vectors with their correlated information and
    indices useful for searching the space.

    Each non-empty line is expected to hold a label followed by the
    vector components, all separated by `sep`. Completely empty lines
    are skipped.

    :param tsv_vecs_path: path of the stored embeddings file to load
    :type tsv_vecs_path: str
    :param sep: separator of the embeddings file
    :type sep: str
    :return: dictionary that contains the embeddings `labels`, the
      `vectors` as returned by `normalize`, the created `faiss_index`,
      the `source` path of the embeddings, the number of embedding
      dimensions `dim` and a schema.org style `dataset_info` description
      of the file
    :rtype: dict
    :raises ValueError: if the file contains no vectors, or a component
      cannot be parsed as a float
    :raises AssertionError: if a line does not have the same number of
      dimensions as the first one
    """
    labels = []
    rows = []
    ndims = None
    start = time.time()
    logger.info('Loading vectors from %s', tsv_vecs_path)
    with open(tsv_vecs_path, 'r', encoding='utf-8') as vecs_f:
        # Stream the file instead of materialising every line up front.
        for line_idx, raw_line in enumerate(vecs_f):
            # Drop the line terminator so it cannot leak into the label
            # of a label-only line.
            line = raw_line.rstrip('\n')
            if not line:
                continue
            label, *values = line.split(sep)
            if ndims is None:
                # The first vector fixes the expected dimensionality.
                ndims = len(values)
            # The message is only formatted when the check fails.
            assert ndims == len(values), (
                'line %d, expecting %d dims, but %d' % (line_idx, ndims,
                                                        len(values)))
            labels.append(label)
            rows.append(
                np.array([float(value) for value in values],
                         dtype=np.float32))

    if not rows:
        # np.vstack would fail here with a much less helpful message.
        raise ValueError('No vectors found in %s' % tsv_vecs_path)
    vectors = np.vstack(rows)

    n_unique_labels = len(set(labels))
    if n_unique_labels != len(labels):
        logger.warning("Repeated labels, %d vs %d", len(labels),
                       n_unique_labels)
    # Sanity check: the stacked matrix must match the per-line dimension.
    assert vectors.shape[1] == ndims, '%d != %d' % (vectors.shape[1], ndims)
    logger.info('Loaded %d vectors in %ds', len(labels), time.time() - start)

    nvectors = normalize(vectors)
    return {
        'labels': labels,
        'vectors': nvectors,
        'faiss_index': create_faiss_index(nvectors, ndims),
        'source': tsv_vecs_path,
        'dim': ndims,
        'dataset_info': {
            '@context':
            'http://schema.org',
            '@type':
            'Dataset',
            'name':
            'Co-inform Sentence embeddings',
            'identifier':
            hashu.sha256_file(tsv_vecs_path),
            'description':
            'Dataset of %d sentence embeddings extracted from claim reviews and articles collected as part of the Co-inform project'
            % len(labels),
            'dateCreated':
            isodate.as_utc_timestamp(os.path.getctime(tsv_vecs_path)),
            'dateModified':
            isodate.as_utc_timestamp(os.path.getmtime(tsv_vecs_path)),
            'creator':
            bot_describer.esiLab_organization(),
            'encoding': {
                '@type': 'MediaObject',
                'contentSize': bot_describer.readable_file_size(tsv_vecs_path),
                'encodingFormat': 'text/tab-separated-values'
            }
        }
    }