Python open Exemples, smart_open.open Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : test_gcs.py Projet : dustymugs/smart_open

    def test_buffered_writer_wrapper_works(self):
        """
        Round-trip text through an io.BufferedWriter wrapped around a
        smart_open gcs writer.

        BufferedWriter passes a memoryview object to the underlying stream
        in python >= 2.7, so the gcs writer has to accept one.
        """
        text = u'не думай о секундах свысока'

        gcs_writer = smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME)
        with gcs_writer as raw_out, io.BufferedWriter(raw_out) as buffered_out:
            buffered_out.write(text.encode('utf-8'))

        blob_url = "gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)
        with smart_open.open(blob_url, 'rb') as raw_in:
            with io.TextIOWrapper(raw_in, encoding='utf-8') as decoded_in:
                round_tripped = decoded_in.read()

        self.assertEqual(text, round_tripped)

Exemple #2

0

Afficher le fichier

Fichier : publish_json.py Projet : mozilla-tw/bigquery-etl

    def _publish_last_updated(self):
        """Write the dataset's last-modified timestamp to GCS as a JSON string."""
        blob_path = "/".join((
            f"api/{self.api_version}/tables/{self.dataset}",
            f"{self.table}/{self.version}/last_updated",
        ))
        destination = f"gs://{self.target_bucket}/{blob_path}"

        logging.info(f"Write last_updated to {destination}")

        with smart_open.open(destination, "w") as sink:
            timestamp = self.last_updated.strftime("%Y-%m-%d %H:%M:%S")
            sink.write(json.dumps(timestamp))

        # Mark the blob as JSON so that the timestamp is displayed in the browser.
        bucket = self.storage_client.get_bucket(self.target_bucket)
        blob = bucket.get_blob(blob_path)
        blob.content_type = "application/json"
        blob.patch()

Exemple #3

0

Afficher le fichier

def transit_groupby_csv(session,
                        params,
                        s3_filename,
                        key,
                        agg,
                        transport_params,
                        chunk_size=1e6):
    """
    Streaming group-by over a large CSV stored on S3.

    Based on
    https://maxhalford.github.io/blog/streaming-groupbys-in-pandas-for-big-datasets/

    The CSV is read in batches of ``chunk_size`` rows to limit memory usage.
    The data needs to be SORTED by ``key``. Because a group can straddle two
    batches, the rows carrying the last key value of each batch ("orphans")
    are held back and prepended to the next batch; whatever is still held
    back after the final batch is aggregated on its own.

    In our case data is grouped by lrimoshipno and sorted by movementdatetime.

    :param session: boto3 session to access athena data (not used in this
        function)
    :param params: boto3 parameter dictionary; ``params['bucket']`` and
        ``params['path']`` locate the file
    :param s3_filename: file to process after running get_data
    :param key: data has to be sorted by this key
    :param agg: data processing function, called once per batch of complete
        groups
    :param transport_params: forwarded to ``open`` (presumably smart_open's
        ``open`` -- the builtin does not accept this argument)
    :param chunk_size: number of rows per batch for processing
    :return: dataframe with potential port coordinates (concatenation of all
        ``agg`` results); ``pd.concat`` raises ValueError if the file has no
        data rows
    """
    source_url = ('s3://' + params['bucket'] + '/' + params['path'] + '/' +
                  s3_filename)
    results = []
    orphans = pd.DataFrame()
    # Keep the stream open only for as long as the chunks are being consumed.
    with open(source_url, transport_params=transport_params) as csv_file:
        chunks = pd.read_csv(csv_file,
                             chunksize=chunk_size,
                             parse_dates=['movementdatetime'])
        for chunk in tqdm(chunks):
            # Add the previous orphans to the chunk
            chunk = pd.concat((orphans, chunk))

            # Rows sharing the last key value may continue in the next chunk
            last_val = chunk[key].iloc[-1]
            is_orphan = chunk[key] == last_val

            # Put the new orphans aside
            chunk, orphans = chunk[~is_orphan], chunk[is_orphan]

            # Perform the aggregation and store the results
            results.append(agg(chunk))

    # The rows held back from the final chunk form the last group; without
    # this step that group would be missing from the result.
    if len(orphans):
        results.append(agg(orphans))
    return pd.concat(results)

Exemple #4

0

Afficher le fichier

Fichier : extract.py Projet : thehoff/scraperx

    def run(self):
        """Extract data from every source file.

        Each source file is read in full and its raw content is handed to the
        extraction tasks produced by `self._get_extraction_tasks`. A failure
        while extracting one source is logged and does not stop the loop.

        If all the sources need to be passed in and extracted at the same
        time, then the user may override this method to do so.
        """
        start_extra = {'task': self.task,
                       **self.scraper.log_extras(),
                       'time_started': self.time_extracted,
                       }
        logger.info("Start Extract", extra=start_extra)

        for source_idx, source_file in enumerate(self._get_sources()):
            # Only S3 sources need transport parameters.
            if source_file.startswith('s3://'):
                transport_params = _get_s3_params(self.scraper,
                                                  context_type='downloader')
            else:
                transport_params = {}

            with open(source_file, 'r', transport_params=transport_params) as source:
                raw_source = source.read()

            try:
                tasks = self._get_extraction_tasks(raw_source, source_idx)
                # A falsy result means there is nothing to run for this source.
                for task in tasks or ():
                    task(raw_source)

            except Exception as e:
                failure_extra = {'task': self.task,
                                 'source_file': source_file,
                                 **self.scraper.log_extras(),
                                 **get_root_exc_log_overides(),
                                 }
                logger.exception(f"Extraction Failed: {e}", extra=failure_extra)

        finish_extra = {'task': self.task,
                        **self.scraper.log_extras(),
                        'time_finished': datetime.datetime.utcnow().isoformat() + 'Z',
                        }
        logger.debug('Extract finished', extra=finish_extra)

Exemple #5

0

Afficher le fichier

Fichier : test_gcs.py Projet : traboukos/smart_open

def test_gcs_performance_small_reads(benchmark):
    """Benchmark reading 1 MiB of small length-prefixed messages back from GCS."""
    initialize_bucket()

    ONE_MIB = 1024**2
    msg = b'\x0f' + b'0123456789abcde'  # a length-prefixed "message"
    # One copy of the message per step of len(msg) bytes across one MiB.
    repetitions = len(range(0, ONE_MIB, len(msg)))
    one_megabyte_of_msgs = msg * repetitions

    key = _GCS_URL + '/many_reads_performance.bin'

    with smart_open.open(key, 'wb') as fout:
        fout.write(one_megabyte_of_msgs)

    actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB)
    assert actual == one_megabyte_of_msgs

Exemple #6

0

Afficher le fichier

Fichier : handler.py Projet : jmzcray/aws_s3_log_ingestion_lambda

async def _fetch_data_from_s3(bucket, key, context):
    """
        Stream a log file from an S3 bucket line by line, group the lines into
        batches and send the batches as async requests.

        A batch is closed once the summed sys.getsizeof() of its lines exceeds
        MAX_BATCH_SIZE * BATCH_SIZE_FACTOR. Pending requests are awaited
        whenever REQUEST_BATCH_SIZE of them have accumulated, and once more at
        the end together with the final (possibly empty) batch. Files larger
        than MAX_FILE_SIZE are rejected with an error log.
    """
    s3_object = boto3.resource('s3').Bucket(bucket).Object(key)
    if s3_object.content_length > MAX_FILE_SIZE:
        logger.error(
            "The log file uploaded to S3 is larger than the supported max size of 400MB")
        return

    metadata = {
        "invoked_function_arn": context.invoked_function_arn,
        "s3_bucket_name": bucket
    }
    source_url = "s3://{}/{}".format(bucket, key)
    size_limit_owner = None  # placeholder removed below; keeps names explicit
    del size_limit_owner
    async with aiohttp.ClientSession() as session:
        pending_lines = []
        pending_requests = []
        batch_number = 1
        pending_size = 0
        started_at = time.time()
        with open(source_url, encoding='utf-8') as log_lines:
            for line_no, line in enumerate(log_lines):
                pending_size += sys.getsizeof(line)
                if line_no % 500 == 0:
                    logger.debug(f"index: {line_no}")
                pending_lines.append(line)
                if pending_size <= (MAX_BATCH_SIZE * BATCH_SIZE_FACTOR):
                    continue

                # The batch is full: queue a request for it and start a new one.
                logger.debug(f"sending batch: {batch_number}")
                payload = {"context": metadata, "entry": pending_lines}
                pending_requests.append(create_log_payload_request(payload, session))
                if len(pending_requests) >= REQUEST_BATCH_SIZE:
                    await asyncio.gather(*pending_requests)
                    pending_requests = []
                pending_lines = []
                pending_size = 0
                batch_number += 1

        # Whatever is left over goes out as the last batch.
        payload = {"context": metadata, "entry": pending_lines}
        pending_requests.append(create_log_payload_request(payload, session))
        logger.info("Sending data to NR logs.....")
        await asyncio.gather(*pending_requests)
        elapsed = time.time() - started_at
        logger.debug(f"time elapsed to send to NR Logs: {elapsed}")

Exemple #7

0

Afficher le fichier

Fichier : check_fastq.py Projet : ebi-ait/hca-ebi-wrangler-central

def main():
    """Tabulate read lengths for the .fastq.gz files of each submission uuid.

    Command-line arguments:
        --uuids: text file with one hca-util submission uuid per line.
        --num_reads: maximum number of reads inspected per file (default 1000).

    For every uuid, the bucket keys containing that uuid are collected, each
    matching .fastq.gz file is streamed, and the lengths of its first
    ``num_reads`` reads are counted. The counts are written to
    ``<uuid>_read_lengths.txt`` as a tab-separated table with one row per file
    and one column per distinct read length.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--uuids', help='.txt file with hca-util submission uuids (uuid only). 1 uuid per line')
    # type=int: a value given on the command line would otherwise arrive as a
    # str and make the numeric comparison against it fail.
    parser.add_argument('--num_reads', default=1000, type=int, help='number of reads to test')

    args = parser.parse_args()

    bucket_name = 'hca-util-upload-area'

    uuids = pd.read_csv(args.uuids, header=None)
    uuids = list(uuids[0])

    # The bucket listing does not depend on the uuid, so it is fetched once.
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket_name)
    keys = ['s3://' + bucket_name + '/' + str(s3_object.key) for s3_object in my_bucket.objects.all()]

    for uuid in uuids:
        filenames = [key for key in keys if uuid in key]
        # NOTE(review): the first match is dropped -- presumably the key of the
        # upload-area folder itself; confirm.
        filenames = filenames[1:]
        filenames = [file for file in filenames if '.fastq.gz' in file]

        my_dict = {}
        for filename in filenames:
            len_seqs = []
            with smart_open.open(filename) as f:
                for record in SeqIO.parse(f, 'fastq'):
                    if len(len_seqs) >= args.num_reads:
                        break
                    len_seqs.append(len(str(record.seq)))
            # Map each distinct read length to its number of occurrences.
            my_dict[filename] = {uniq: len_seqs.count(uniq) for uniq in set(len_seqs)}

        data = pd.DataFrame.from_dict(my_dict, orient='index')
        out_file = uuid + "_read_lengths.txt"
        data.to_csv(out_file, sep="\t")
        print("Done processing uuid: %s" % (uuid))

Exemple #8

0

Afficher le fichier

    def __init__(self, filename, validate_file=False, limit=None):
        """
        Build the word <-> id mappings from a vocabulary file.

        filename: the vocabulary file.  It is a flat text file with one
            (normalized) token per line.  In addition, the file should also
            contain the special tokens <S>, </S>, <UNK> (case sensitive). Can be
            None, in which case a vocabulary of just the three special tokens
            is used.
        validate_file: if True, raise ValueError when any of <S>, </S>, <UNK>
            is missing from the loaded vocabulary.
        limit: process only the first <limit> words from the file; can be useful
            at inference (we assume the vocabulary is sorted by frequency).

        Lines equal to '!!!MAXTERMID' are skipped and do not consume an id.
        """
        self._id_to_word = []
        self._word_to_id = {}
        self._unk = -1
        self._bos = -1
        self._eos = -1

        vocab_file = None
        if filename:
            vocab_file = open(filename, 'r')  # Loading vocabulary from file
            vocab_source = vocab_file
        else:
            logging.info(
                "No vocabulary file provided; using special tokens only.")
            vocab_source = ["<S>", "</S>",
                            "<UNK>"]  # Creating a toy vocabulary ourselves
        try:
            idx = 0
            for line in vocab_source:
                word_name = line.strip()
                if word_name == '<S>':
                    self._bos = idx
                elif word_name == '</S>':
                    self._eos = idx
                elif word_name == '<UNK>':
                    self._unk = idx
                if word_name == '!!!MAXTERMID':
                    continue

                self._id_to_word.append(word_name)
                self._word_to_id[word_name] = idx
                idx += 1
                if idx == limit:
                    break
        finally:
            # Release the file handle even if reading fails or stops early.
            if vocab_file is not None:
                vocab_file.close()
        logging.info(
            f"We will cache the vocabulary of {len(self._id_to_word)} tokens.")
        # check to ensure file has special tokens
        if validate_file:
            if self._bos == -1 or self._eos == -1 or self._unk == -1:
                raise ValueError("Ensure the vocabulary file has "
                                 "<S>, </S>, <UNK> tokens")

Exemple #9

0

Afficher le fichier

def upload_from_url(url, s3_key, on_stream_opened=None):
    """Stream the response body of ``url`` into the S3 object ``s3_key``.

    Raises ConnectionError when the source does not answer with status 200.
    ``on_stream_opened``, if given, is called with the response headers before
    the upload starts. ClientError, ConnectionError and ValueError raised
    during the upload are logged and re-raised. Returns the ``head_object``
    response for the uploaded key, or None if that response is falsy.
    """
    bucket = app.config['LOCH_S3_BUCKET']
    s3_url = build_s3_url(s3_key)
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            app.logger.error(
                f'Received unexpected status code, aborting S3 upload '
                f'(status={response.status_code}, body={response.text}, key={s3_key} url={url})'
            )
            raise ConnectionError(
                f'Response {response.status_code}: {response.text}')
        if on_stream_opened:
            on_stream_opened(response.headers)
        try:
            upload_kwargs = {
                'ServerSideEncryption': app.config['LOCH_S3_ENCRYPTION']
            }
            if s3_url.endswith('.gz'):
                upload_kwargs['ContentEncoding'] = 'gzip'
                upload_kwargs['ContentType'] = 'text/plain'
            transport_params = dict(
                session=get_session(),
                multipart_upload_kwargs=upload_kwargs,
            )
            # ignore_ext=True: smart_open needs to be told to ignore the .gz
            # extension, or it will attempt to compress the data a second time.
            with smart_open.open(s3_url,
                                 'wb',
                                 ignore_ext=True,
                                 transport_params=transport_params) as s3_out:
                for chunk in response.iter_content(chunk_size=1024):
                    s3_out.write(chunk)
        except (ClientError, ConnectionError, ValueError) as e:
            app.logger.error(
                f'Error on S3 upload: source_url={url}, bucket={bucket}, key={s3_key}, error={e}'
            )
            raise e
    s3_response = get_client().head_object(Bucket=bucket, Key=s3_key)
    if not s3_response:
        return None
    app.logger.info(
        f'S3 upload complete: source_url={url}, bucket={bucket}, key={s3_key}'
    )
    return s3_response

Exemple #10

0

Afficher le fichier

def find_secrets():
    """Count keyword hits in every filing of the master list and upload the table.

    For each row of 'master_file_list_subset_clean.csv' the filing is read
    from the 'sec-filings-v2' bucket, split into header and body, tokenized
    and passed to ``get_count``. The resulting counts, plus identifying
    columns from the master list, become one row of the result table.

    Every 100 filings a sorted snapshot is uploaded as
    'keywords_temp_<n>.csv' and the previous snapshot is deleted from the
    '10k-output' bucket. At the end the full sorted table is uploaded as
    'keywords_final.csv' and ``stop_ec2()`` is called.
    """
    # create empty dataframe
    df = pd.DataFrame(columns=[
        'cik', 'date', 'company_name', 'form_type', 'filename', 'total',
        'has_secret'
    ] + secret_list + ['has_protection'] + protect_list)

    client = connect_s3()

    # iterate through files
    df_mf = pd.read_csv('master_file_list_subset_clean.csv')
    for progress, i in enumerate(tqdm(df_mf.index)):
        f = df_mf.at[i, 'Filename'].replace('edgar/data/', '')
        with open(f's3://sec-filings-v2/{f}',
                  transport_params={'client': client}) as f_in:
            contents = f_in.read()

        sec_header, ten_k_body = text_preprocessing(contents)
        # NOTE(review): fyear and sic are computed but never used below; the
        # calls are kept in case they validate the header -- confirm.
        fyear = extract_fyear(sec_header)
        sic = extract_sic(sec_header)

        text_list = tokenize(ten_k_body)
        # search for the keywords
        counts_dict = get_count(text_list)
        counts_dict['cik'] = df_mf.at[i, 'CIK']
        counts_dict['date'] = df_mf.at[i, 'Date Filed']
        counts_dict['company_name'] = df_mf.at[i, 'Company Name']
        counts_dict['filename'] = f
        counts_dict['form_type'] = df_mf.at[i, 'Form Type']

        # DataFrame.append was removed in pandas 2.0; concatenating a one-row
        # frame is the documented replacement.
        df = pd.concat([df, pd.DataFrame([counts_dict])], ignore_index=True)

        if progress % 100 == 0 and progress >= 100:
            df_temp = df.sort_values(by=['cik', 'date'])
            csv_to_s3(df_temp, client, f'keywords_temp_{progress}.csv')
            # Keep only the most recent snapshot.
            if progress != 100:
                client.delete_object(Bucket='10k-output',
                                     Key=f'keywords_temp_{progress-100}.csv')

    df = df.sort_values(by=['cik', 'date'])
    csv_to_s3(df, client, 'keywords_final.csv')
    stop_ec2()

Exemple #11

0

Afficher le fichier

def read_file(fpath: str, **kwargs) -> str:
    """
    Return the full content of a file opened through `smart_open`.

    Parameters
    -----------
    fpath: str
        File path.
    kwargs: optional
        Extra keyword arguments forwarded unchanged to `smart_open.open`.

    Returns
    --------
        Whatever ``read()`` returns for the opened file.
    """
    with smart_open.open(fpath, **kwargs) as handle:
        return handle.read()

Exemple #12

0

Afficher le fichier

Fichier : citylink_cluster.py Projet : z-yin/CityLink

    def __iter__(self):
        """Yield one ``{'words': ..., 'cities': ...}`` dict per usable line.

        Every file in ``self.file_list`` (relative to ``self.root_path``) is
        read line by line. A line is skipped when it has fewer than 2 words
        after ``self._process`` or mentions fewer than 2 distinct cities.
        """
        jieba.enable_parallel(8)
        for name in self.file_list:
            with open(self.root_path + name, encoding='utf-8') as source:
                for raw_line in source:
                    tokens = self._process(raw_line)
                    # Fewer than 2 words cannot contain 2 cities.
                    if not tokens or len(tokens) < 2:
                        continue
                    tokens, found = self._retrieve_cities(tokens)
                    unique_cities = list(set(found))
                    # Fewer than 2 distinct cities cannot form a link.
                    if len(unique_cities) < 2:
                        continue
                    yield {'words': tokens, 'cities': unique_cities}

Exemple #13

0

Afficher le fichier

 def restore_spilled_objects(self, object_refs: List[ObjectRef],
                             url_with_offset_list: List[str]):
     """Read spilled objects back from storage and put them into the store.

     ``object_refs[i]`` is restored from ``url_with_offset_list[i]``. Each
     entry is decoded, then parsed into a base URL, an offset and a size. At
     the offset the file holds two 8-byte little-endian lengths (metadata,
     then buffer), followed by the metadata and the object data.
     """
     for position, ref in enumerate(object_refs):
         # NOTE(review): entries are .decode()d, so they appear to be bytes
         # despite the List[str] annotation -- confirm.
         location = url_with_offset_list[position].decode()
         # Retrieve the information needed.
         parsed = parse_url_with_offset(location)
         # Read a part of the file and recover the object.
         with open(parsed.base_url, "rb") as spill_file:
             spill_file.seek(parsed.offset)
             metadata_len = int.from_bytes(spill_file.read(8),
                                           byteorder="little")
             buf_len = int.from_bytes(spill_file.read(8), byteorder="little")
             self._size_check(metadata_len, buf_len, parsed.size)
             metadata = spill_file.read(metadata_len)
             # The remaining data is read from the file into our buffer.
             self._put_object_to_store(metadata, buf_len, spill_file, ref)

Exemple #14

0

Afficher le fichier

 def process(self, something):
     """Yield a single list of product rows parsed from ``self.input_path``.

     The file must contain a Python literal: a sequence of dicts. Each dict
     becomes ``[id, name, price, currency, created_at]``, with None for any
     missing key. ``something`` is not used. The rows are also printed, and
     the root logger level is set to INFO before yielding.
     """
     columns = ("id", "name", "price", "currency", "created_at")
     with open(self.input_path) as fin:
         products = ast.literal_eval(fin.read())
         rows = [[product.get(column) for column in columns]
                 for product in products]
     print(rows)
     logging.getLogger().setLevel(logging.INFO)
     yield rows

Exemple #15

0

Afficher le fichier

    def __build_sentences(corpus_path, data_path):
        """Return a LineSentence iterator over the word-segmented corpus.

        The segmented text is cached in ``<data_path>/sentences.txt``. When
        that file does not exist yet, the CSV at ``corpus_path`` is read, the
        second column of every row is segmented with jieba, and each result
        is written as one space-separated line. An existing file is reused
        as is.
        """
        sentences_path = '%s/sentences.txt' % data_path

        if not os.path.exists(sentences_path):
            # Read the corpus, segment it into words, then save the result.
            with open(corpus_path, 'r') as corpus_file:
                with open(sentences_path, 'w') as sentences_file:
                    for row in csv.reader(corpus_file):
                        # One sentence per line, each terminated by a newline,
                        # so no sentence is lost at the end of the corpus and
                        # none is glued to its neighbour. The file object does
                        # the write buffering.
                        sentences_file.write(' '.join(jieba.cut(row[1])))
                        sentences_file.write('\n')

        return word2vec.LineSentence(smart_open.open(sentences_path))

Exemple #16

0

Afficher le fichier

Fichier : load_ddb.py Projet : roadmonster/cpsc5330-wq20

def add_tfidf_records(
        s3_filename='s3://hanks-bda-2020-01/data-output/tfidf/000000_0'):
    """Load tf-idf records from ``s3_filename`` into the 'tfidf' DynamoDB table.

    Each line holds ``doc_id``, ``term`` and ``value`` separated by the
    '\\x01' character. ``create_table()`` is called before loading, and a
    progress line is printed every 1000 records.
    """
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('tfidf')
    create_table()
    with open(s3_filename, 'rb') as fin:
        for record_no, raw_line in enumerate(fin, start=1):
            fields = raw_line.decode('utf-8').strip().split('\x01')
            doc_id, term, value = fields
            item = {
                'term': term,
                'doc_id': doc_id,
                'value': Decimal(value)
            }
            table.put_item(Item=item)
            if record_no % 1000 == 0:
                print(f"I {record_no}")

Exemple #17

0

Afficher le fichier

def unpickle(fname):
    """Load a pickled object from `fname`.

    Parameters
    ----------
    fname : str
        Path to pickle file. It is passed to `open`; with smart_open's `open`
        it can be on S3, HDFS, compressed etc.

    Returns
    -------
    object
        Python object loaded from `fname`.

    """
    # NOTE: unpickling executes code chosen by whoever wrote the file; only
    # load files from trusted sources.
    with open(fname, 'rb') as stream:
        # The original comment read "needed because loading from S3 doesn't
        # support readline()"; it is unclear which part of this call that
        # refers to.
        loaded = _pickle.load(stream, encoding='latin1')
    return loaded

Exemple #18

0

Afficher le fichier

Fichier : slingshot.py Projet : quadrismegistus/slingshot

def get_paths_from_csv(_fnfn,
                       path_key=PATH_KEY,
                       path_ext=PATH_EXT,
                       path_prefix='',
                       path_suffix='',
                       sep='\t'):
    """Collect the non-empty values of one column of a delimited file.

    The file ``_fnfn`` is read with ``csv.DictReader`` using ``sep`` as
    delimiter. For every row with a non-empty ``path_key`` value (falling
    back to DEFAULT_PATH_KEY when ``path_key`` is falsy), the value is
    joined onto ``path_prefix`` with ``os.path.join`` and has ``path_suffix``
    appended, when those are given. ``path_ext`` is accepted but not used.
    Returns the list of resulting paths in file order.
    """
    column = path_key or DEFAULT_PATH_KEY
    collected = []
    with open(_fnfn) as table_file:
        for row in csv.DictReader(table_file, delimiter=sep):
            value = row.get(column, '')
            if not value:
                continue
            if path_prefix:
                value = os.path.join(path_prefix, value)
            if path_suffix:
                value = value + path_suffix
            if value:
                collected.append(value)
    return collected

Exemple #19

0

Afficher le fichier

Fichier : checkpoint.py Projet : jeffhsu3/mesh-transformer-jax

def parallel_read(old, fname):
    """Load arrays from ``fname`` and arrange them in the tree structure of ``old``.

    ``old`` supplies the tree structure and the expected shape of every
    leaf. The file is read fully into memory and parsed with ``np.load``;
    the loaded arrays are taken in iteration order and paired with the
    leaves of ``old``.

    Raises AssertionError ("Incompatible checkpoint") when a loaded array
    and its paired leaf differ in shape. Arrays with dtype 'V2' are
    reinterpreted in place as bfloat16.

    Returns the pytree rebuilt from the loaded arrays.
    """
    # jax.tree_util is the stable home of these helpers; the top-level
    # jax.tree_flatten / jax.tree_unflatten aliases are deprecated.
    old_val, treedef = jax.tree_util.tree_flatten(old)
    with open(fname, "rb") as f:
        # Buffer the whole file so np.load works on an in-memory stream.
        loaded = np.load(io.BytesIO(f.read()))

    new_vals = [loaded[name] for name in loaded]

    for o, n in zip(new_vals, old_val):
        assert o.shape == n.shape, "Incompatible checkpoint"

        # 2-byte void dtype: raw bfloat16 data that numpy could not label.
        if o.dtype == np.dtype('V2'):
            o.dtype = jnp.bfloat16

    return jax.tree_util.tree_unflatten(treedef, new_vals)

Exemple #20

0

Afficher le fichier

def open_s3(uri: str, *args: Any, **kwargs: Any) -> smart_open.open:
    """Open an s3 key as a stream for read / write operations.

    Thin wrapper around smart_open.open that always passes a fixed set of
    transport parameters (custom endpoint URL and AES256 server-side
    encryption), which allows us to fine-tune access control for testing.
    Extra positional and keyword arguments are forwarded unchanged.
    """
    resource_kwargs = {'endpoint_url': _ENDPOINT_URL}
    extra_args = {'ServerSideEncryption': 'AES256'}
    transport_params = {
        'resource_kwargs': resource_kwargs,
        'ExtraArgs': extra_args,
    }
    return smart_open.open(uri,
                           *args,
                           transport_params=transport_params,
                           **kwargs)

Exemple #21

0

Afficher le fichier

    def write_file_to_storage(self, records):
        """Write ``records`` as tab-separated rows to ``s3://<bucket>/<key>.tsv.gz``.

        The bucket, key, session and endpoint URL are taken from the instance.
        """
        # Destination URI of the object.
        storage_uri = 's3://%s/%s.tsv.gz' % (self.s3_bucket, self.s3_key)

        # Connection settings handed to smart_open.
        resource_kwargs = {'endpoint_url': self.endpoint_url}
        transport_params = {
            'session': self.session,
            'resource_kwargs': resource_kwargs,
        }

        # Write all rows in one go.
        with smart_open.open(storage_uri, 'w',
                             transport_params=transport_params) as sink:
            csv.writer(sink, delimiter='\t',
                       lineterminator='\n').writerows(records)

Exemple #22

0

Afficher le fichier

 def output_samples(self, filename, n=None):
     """Save an image grid for the categorical and the continuous samples.

     For each sample set that is not None, the images produced by
     ``self.trainer.target_g`` are written as PNG next to ``filename``, as
     ``info_cat_<basename>`` and ``info_cont_<basename>`` respectively.
     ``n`` is not used.
     """
     sample_sets = (('cat', self.categorical_samples),
                    ('cont', self.continuous_samples))
     with torch.no_grad():
         for label, batch in sample_sets:
             if samples_missing := batch is None:
                 continue
             images = self.trainer.target_g(batch)
             target_dir = os.path.dirname(filename)
             target_name = f'info_{label}_{os.path.basename(filename)}'
             grid_path = os.path.join(target_dir, target_name)
             # One grid row per entry along the samples' second dimension.
             per_row = batch.shape[1]
             with smart_open.open(grid_path, 'wb') as sink:
                 torchvision.utils.save_image(images,
                                              sink,
                                              nrow=per_row,
                                              normalize=True,
                                              range=(-1, 1),
                                              format='png')

Exemple #23

0

Afficher le fichier

Fichier : exporter.py Projet : pascoemitch/opennem

def wem_export_years():
    """Export the WEM energy and market value series for 2020 and 2019.

    For each year the results of ``wem_energy_year`` and
    ``wem_market_value_year`` are concatenated and written as JSON to
    ``BASE_EXPORT + "/wem/energy/daily/<year>.json"``.
    """
    for year in [2020, 2019]:
        energy = wem_energy_year(year)
        market_value = wem_market_value_year(year)
        json_envelope = energy + market_value

        year_path = BASE_EXPORT + f"/wem/energy/daily/{year}.json"
        upload_params = dict(multipart_upload_kwargs=UPLOAD_ARGS)

        with open(year_path, "w", transport_params=upload_params) as fh:
            json.dump(json_envelope, fh, cls=NemEncoder)

Exemple #24

0

Afficher le fichier

Fichier : __init__.py Projet : weslst/csvsorter

def mergesort(sorted_filenames, columns, nway=2, tmp_dir='', encoding='utf-8'):
    """Merge sorted csv files, ``nway`` at a time, into a single output file.

    Repeatedly takes the first ``nway`` files, merges their rows (as produced
    by ``yield_csv_rows``) into ``merge<N>.csv`` inside ``tmp_dir``, appends
    that file to the work list and deletes the files it was built from, until
    one file remains.

    Returns the name of the remaining file. A single input file is returned
    unchanged. An empty ``sorted_filenames`` raises IndexError.

    Raises ValueError when more than one file has to be merged and ``nway``
    is smaller than 2: each pass would then produce at least as many files
    as it consumes and the loop would never finish.
    """
    if len(sorted_filenames) > 1 and nway < 2:
        raise ValueError('nway must be at least 2, got {!r}'.format(nway))

    merge_n = 0
    while len(sorted_filenames) > 1:
        merge_filenames, sorted_filenames = sorted_filenames[:nway], sorted_filenames[nway:]

        output_filename = os.path.join(tmp_dir, 'merge{}.csv'.format(merge_n))
        with open(output_filename, 'w', newline='\n', encoding=encoding) as output_fp:
            writer = csv.writer(output_fp)
            merge_n += 1
            rows = (yield_csv_rows(filename, columns, encoding) for filename in merge_filenames)
            # heapq.merge keeps the combined stream sorted without loading
            # the inputs into memory.
            writer.writerows(heapq.merge(*rows))
        sorted_filenames.append(output_filename)

        # The inputs are now contained in the merged file.
        for filename in merge_filenames:
            os.remove(filename)
    return sorted_filenames[0]

Exemple #25

0

Afficher le fichier

Fichier : deserialize_reader.py Projet : jennyzhang-petuum/forte

    def _collect(self, data_dir: str) -> Iterator[str]:  # type: ignore
        """
        Walk ``data_dir`` recursively and yield the content of its files.

        If the 'suffix' field in the config is set, only files whose name
        ends with that suffix are read. See
        :func:`~forte.data.readers.RecursiveDirectory
        DeserializeReader.default_configs` for the default configs.

        Args:
            data_dir: The root directory to search for the data packs.

        Returns: Iterator of the data pack string from the directory.
        """
        for folder, _, names in os.walk(data_dir):
            for name in names:
                suffix = self.configs.suffix
                if suffix and not name.endswith(suffix):
                    continue
                with open(os.path.join(folder, name)) as pack_file:
                    yield pack_file.read()

Exemple #26

0

Afficher le fichier

def download_file(src: Union[str, "Pathy"],
                  dest: Path,
                  *,
                  force: bool = False) -> None:
    """Download a file using smart_open.

    src (str or Pathy): The URL or path of the source file; converted to
        str before opening.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    if dest.exists() and not force:
        return None

    # Imported only when a download actually happens.
    import shutil

    import smart_open

    # ignore_ext=True: copy the bytes as stored instead of letting smart_open
    # decompress based on the file extension.
    with smart_open.open(str(src), mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            # Copy in chunks so large files are never held fully in memory.
            shutil.copyfileobj(input_file, output_file)

Exemple #27

0

Afficher le fichier

Fichier : test_gcs.py Projet : traboukos/smart_open

    def test_write_03b(self):
        """Does writing a last chunk size equal to a multiple of the min_part_size work?"""
        min_part_size = 256 * 1024
        payload = b"t" * min_part_size * 2

        writer = smart_open.gcs.Writer(
            BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size
        )
        with writer as fout:
            fout.write(payload)
            # After the write, one part has been counted and min_part_size
            # bytes remain in the current part.
            self.assertEqual(fout._current_part.tell(), min_part_size)
            self.assertEqual(fout._total_parts, 1)

        # read back the same key and check its content
        blob_url = "gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)
        with smart_open.open(blob_url) as fin:
            text = fin.read()
        read_back = text.encode('utf-8')

        self.assertEqual(read_back, payload)

Exemple #28

0

Afficher le fichier

    def _open(self, path_or_uri, mode):
        """Open a source file and return a file-like object.

        A LocalGitFile is read through ``git show <ref>:<path>`` and returned
        as an in-memory bytes stream (``mode`` is not used for it). A
        SourceFile is first resolved to its path or URI. Everything else is
        opened with smart_open in the given ``mode``; any failure there is
        re-raised as WorkflowError.
        """
        from smart_open import open

        if isinstance(path_or_uri, LocalGitFile):
            import git

            repo = git.Repo(path_or_uri.repo_path)
            revision_path = "{}:{}".format(path_or_uri.ref, path_or_uri.path)
            content = repo.git.show(revision_path)
            return io.BytesIO(content.encode())

        if isinstance(path_or_uri, SourceFile):
            path_or_uri = path_or_uri.get_path_or_uri()

        try:
            stream = open(path_or_uri, mode)
        except Exception as e:
            message = "Failed to open source file {}".format(path_or_uri)
            raise WorkflowError(message, e)
        return stream

Exemple #29

0

Afficher le fichier

Fichier : train.py Projet : eminentli/gretel-synthetics

def _train_tokenizer(store: BaseConfig) -> spm.SentencePieceProcessor:
    """
    Train a SentencePiece tokenizer on the training data and return it loaded.

    After training, the model files are moved with ``_move_tokenizer_model``,
    the model is loaded from ``store.tokenizer_model``, and the first line of
    the training data is logged together with its encoding as pieces and as
    ids.
    """
    logging.info("Training SentencePiece tokenizer")
    trainer_options = dict(
        input=store.training_data,
        model_prefix=store.tokenizer_prefix,
        user_defined_symbols=["<n>", store.field_delimiter_token],
        vocab_size=store.vocab_size,
        hard_vocab_limit=False,
        max_sentence_length=store.max_line_len,
        character_coverage=store.character_coverage,
    )
    spm.SentencePieceTrainer.Train(**trainer_options)
    _move_tokenizer_model(store)

    sp = spm.SentencePieceProcessor()
    logging.info(f"Loading tokenizer from: {Path(store.tokenizer_model).name}")
    sp.Load(store.tokenizer_model)

    # Log what the tokenizer does with the first training line.
    with open(store.training_data) as training_file:
        sample = training_file.readline().strip()
    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")

    sampled_pieces = ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n'
        .format(repr(sample), sampled_pieces))

    encoded_ids = ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n'
        .format(repr(sample), encoded_ids))

    logging.info(
        f"Saving SentencePiece model to {store.tokenizer_prefix}.model and {store.tokenizer_prefix}.vocab"
    )
    return sp

Exemple #30

0

Afficher le fichier

Fichier : data_helpers.py Projet : hallogameboy/Reformulation-Inference-Network

def load_data(node_id, node_emb, tq_emb, qc, qc_loc, FLAGS, filename):
    """Build the training arrays (X, R, y) from a tab-separated session file.

    Each line of ``filename`` is split on tabs; the queries are the fields at
    positions 1, 4, 7, ... Every query is embedded with ``get_tq_emb``. For
    each consecutive pair ``(queries[i - 1], queries[i])`` found in
    ``qc_loc``, and for every candidate ``c`` in ``qc[queries[i - 1]]``, one
    sample is emitted:

    * X: a window of the per-query features preceding position ``i``,
      left-padded with zeros to ``n_fea * FLAGS.max_len`` values;
    * R: the candidate embedding followed by its difference from the
      embedding of the previous query;
    * y: 1.0 when the candidate is the query actually issued next, else 0.0.

    Returns ``(np.array(output_X), np.array(output_R), output_y)`` where
    ``output_y`` has shape ``(n_samples, 1)``.
    """
    output_X, output_R, output_y = [], [], []
    # NOTE(review): the loop below appends embs[i] plus a difference vector
    # per query; whether that adds up to 2 * emb_dim + 2 values depends on the
    # length returned by get_tq_emb -- confirm the window arithmetic.
    n_fea = 2 * FLAGS.emb_dim + 2
    with open(filename, 'r') as fp:
        for line in fp:
            data = line.strip().split('\t')
            # Every third field, starting at index 1, is a query.
            queries = [data[i] for i in range(1, len(data), 3)]
            embs = [
                get_tq_emb(node_id, node_emb, tq_emb, FLAGS, q)
                for q in queries
            ]
            # qembs = [ node_emb[node_id[('query', q)]] if ('query', q) in node_id else np.zeros(FLAGS.emb_dim) for q in queries]
            # From here on `data` is the flat feature vector of the session.
            data = np.array([])
            for i in range(len(queries)):
                # data = np.append(data, [math.log10(i + 1)])
                # data = np.append(data, qembs[i])
                data = np.append(data, embs[i])
                # Difference to the previous query's embedding; zeros for the
                # first query. The conditional applies to the whole
                # subtraction.
                data = np.append(
                    data, embs[i] -
                    embs[i - 1] if i > 0 else np.zeros(FLAGS.emb_dim))
            for i in range(1, len(queries)):
                # Only reformulation pairs listed in qc_loc produce samples.
                if (queries[i - 1], queries[i]) not in qc_loc:
                    continue

                # Window over the features of at most max_len queries before i.
                L = max(0, (i - FLAGS.max_len)) * n_fea
                R = i * n_fea
                X = data[L:R]
                # Left-pad short histories with zeros.
                if X.size < n_fea * FLAGS.max_len:
                    X = np.append(np.zeros(n_fea * FLAGS.max_len - X.size), X)
                assert (X.size == n_fea * FLAGS.max_len)

                for c in qc[queries[i - 1]]:
                    # R is reused here for the candidate feature vector; the
                    # window bound above is no longer needed at this point.
                    R = np.array([])
                    cemb = get_tq_emb(node_id, node_emb, tq_emb, FLAGS, c)
                    R = np.append(R, cemb)
                    R = np.append(R, cemb - embs[i - 1])

                    output_X.append(X)
                    output_R.append(R)
                    output_y.append(1.0 if c == queries[i] else 0.0)

    # Column vector of labels.
    output_y = np.array(output_y).reshape((len(output_y), 1))

    return np.array(output_X), np.array(output_R), output_y